Merge pull request #3785 from jet47:tiny-gpu-module

Alexander Smorkalov 2015-05-18 08:00:12 +00:00
commit 0410fe6978
67 changed files with 3892 additions and 359 deletions

View File

@ -194,6 +194,7 @@ OCV_OPTION(BUILD_WITH_STATIC_CRT "Enables use of staticaly linked CRT for sta
OCV_OPTION(BUILD_FAT_JAVA_LIB "Create fat java wrapper containing the whole OpenCV library" ON IF NOT BUILD_SHARED_LIBS AND CMAKE_COMPILER_IS_GNUCXX )
OCV_OPTION(BUILD_ANDROID_SERVICE "Build OpenCV Manager for Google Play" OFF IF ANDROID AND ANDROID_SOURCE_TREE )
OCV_OPTION(BUILD_ANDROID_PACKAGE "Build platform-specific package for Google Play" OFF IF ANDROID )
OCV_OPTION(BUILD_TINY_GPU_MODULE "Build tiny gpu module with limited image format support" OFF )
# 3rd party libs
OCV_OPTION(BUILD_ZLIB "Build zlib from source" WIN32 OR APPLE )
@ -996,6 +997,7 @@ if(HAVE_CUDA)
status(" NVIDIA GPU arch:" ${OPENCV_CUDA_ARCH_BIN})
status(" NVIDIA PTX archs:" ${OPENCV_CUDA_ARCH_PTX})
status(" Use fast math:" CUDA_FAST_MATH THEN YES ELSE NO)
status(" Tiny gpu module:" BUILD_TINY_GPU_MODULE THEN YES ELSE NO)
endif()
if(HAVE_OPENCL)

View File

@ -63,6 +63,10 @@ if(OPENCV_CAN_BREAK_BINARY_COMPATIBILITY)
add_definitions(-DOPENCV_CAN_BREAK_BINARY_COMPATIBILITY)
endif()
if(BUILD_TINY_GPU_MODULE)
add_definitions(-DOPENCV_TINY_GPU_MODULE)
endif()
if(CMAKE_COMPILER_IS_GNUCXX)
# High level of warnings.
add_extra_compiler_option(-W)
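
Together these two build-system hunks wire the new option through to the compiler: configuring with -DBUILD_TINY_GPU_MODULE=ON makes CMake define OPENCV_TINY_GPU_MODULE for every translation unit of the module, and all of the source changes below key off that single symbol. A minimal sketch of the compile-time switch as downstream code sees it (the helper below is illustrative, not part of the patch):

#include <opencv2/core/core.hpp>

// Illustrative only: mirrors how the patch trims per-depth support lists.
static bool depthCompiledIn(int depth)
{
#ifdef OPENCV_TINY_GPU_MODULE
    // tiny build: most routines keep only the 8-bit and 32-bit float paths
    return depth == CV_8U || depth == CV_32F;
#else
    // full build: the wider depth coverage stays available
    return depth == CV_8U || depth == CV_16U || depth == CV_32F || depth == CV_64F;
#endif
}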

View File

@ -46,7 +46,11 @@ using namespace std;
using namespace testing;
using namespace perf;
#ifdef OPENCV_TINY_GPU_MODULE
#define ARITHM_MAT_DEPTH Values(CV_8U, CV_32F)
#else
#define ARITHM_MAT_DEPTH Values(CV_8U, CV_16U, CV_32F, CV_64F)
#endif
//////////////////////////////////////////////////////////////////////
// Merge
@ -524,9 +528,17 @@ PERF_TEST_P(Sz_Depth, Core_AbsDiffScalar,
//////////////////////////////////////////////////////////////////////
// Abs
PERF_TEST_P(Sz_Depth, Core_Abs,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_16S, CV_32F)))
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth, Core_Abs, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(MatDepth(CV_32F))
))
#else
PERF_TEST_P(Sz_Depth, Core_Abs, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_16S, CV_32F)
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -552,9 +564,17 @@ PERF_TEST_P(Sz_Depth, Core_Abs,
//////////////////////////////////////////////////////////////////////
// Sqr
PERF_TEST_P(Sz_Depth, Core_Sqr,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16S, CV_32F)))
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth, Core_Sqr, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(MatDepth(CV_32F))
))
#else
PERF_TEST_P(Sz_Depth, Core_Sqr, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16S, CV_32F)
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -580,9 +600,17 @@ PERF_TEST_P(Sz_Depth, Core_Sqr,
//////////////////////////////////////////////////////////////////////
// Sqrt
PERF_TEST_P(Sz_Depth, Core_Sqrt,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16S, CV_32F)))
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth, Core_Sqrt, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(MatDepth(CV_32F))
))
#else
PERF_TEST_P(Sz_Depth, Core_Sqrt, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16S, CV_32F)
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -612,9 +640,17 @@ PERF_TEST_P(Sz_Depth, Core_Sqrt,
//////////////////////////////////////////////////////////////////////
// Log
PERF_TEST_P(Sz_Depth, Core_Log,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16S, CV_32F)))
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth, Core_Log, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(MatDepth(CV_32F))
))
#else
PERF_TEST_P(Sz_Depth, Core_Log, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16S, CV_32F)
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -644,9 +680,17 @@ PERF_TEST_P(Sz_Depth, Core_Log,
//////////////////////////////////////////////////////////////////////
// Exp
PERF_TEST_P(Sz_Depth, Core_Exp,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16S, CV_32F)))
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth, Core_Exp, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(MatDepth(CV_32F))
))
#else
PERF_TEST_P(Sz_Depth, Core_Exp, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16S, CV_32F)
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -678,10 +722,19 @@ PERF_TEST_P(Sz_Depth, Core_Exp,
DEF_PARAM_TEST(Sz_Depth_Power, cv::Size, MatDepth, double);
PERF_TEST_P(Sz_Depth_Power, Core_Pow,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Power, Core_Pow, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(MatDepth(CV_32F)),
Values(0.3, 2.0, 2.4)
))
#else
PERF_TEST_P(Sz_Depth_Power, Core_Pow, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16S, CV_32F),
Values(0.3, 2.0, 2.4)))
Values(0.3, 2.0, 2.4)
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -859,10 +912,19 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseAndMat,
//////////////////////////////////////////////////////////////////////
// BitwiseAndScalar
PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseAndScalar,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseAndScalar, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(MatDepth(CV_8U)),
testing::Values(MatCn(Gray))
))
#else
PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseAndScalar, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32S),
GPU_CHANNELS_1_3_4))
GPU_CHANNELS_1_3_4
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -935,10 +997,19 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseOrMat,
//////////////////////////////////////////////////////////////////////
// BitwiseOrScalar
PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseOrScalar,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseOrScalar, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(MatDepth(CV_8U)),
testing::Values(MatCn(Gray))
))
#else
PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseOrScalar, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32S),
GPU_CHANNELS_1_3_4))
GPU_CHANNELS_1_3_4
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -1011,10 +1082,19 @@ PERF_TEST_P(Sz_Depth, Core_BitwiseXorMat,
//////////////////////////////////////////////////////////////////////
// BitwiseXorScalar
PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseXorScalar,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseXorScalar, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(MatDepth(CV_8U)),
testing::Values(MatCn(Gray))
))
#else
PERF_TEST_P(Sz_Depth_Cn, Core_BitwiseXorScalar, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32S),
GPU_CHANNELS_1_3_4))
GPU_CHANNELS_1_3_4
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -1155,9 +1235,17 @@ PERF_TEST_P(Sz_Depth, Core_MinMat,
//////////////////////////////////////////////////////////////////////
// MinScalar
PERF_TEST_P(Sz_Depth, Core_MinScalar,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F)))
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth, Core_MinScalar, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F)
))
#else
PERF_TEST_P(Sz_Depth, Core_MinScalar, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F)
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -1226,9 +1314,17 @@ PERF_TEST_P(Sz_Depth, Core_MaxMat,
//////////////////////////////////////////////////////////////////////
// MaxScalar
PERF_TEST_P(Sz_Depth, Core_MaxScalar,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F)))
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth, Core_MaxScalar, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F)
))
#else
PERF_TEST_P(Sz_Depth, Core_MaxScalar, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F)
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -1263,11 +1359,21 @@ PERF_TEST_P(Sz_Depth, Core_MaxScalar,
DEF_PARAM_TEST(Sz_3Depth, cv::Size, MatDepth, MatDepth, MatDepth);
PERF_TEST_P(Sz_3Depth, Core_AddWeighted,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_3Depth, Core_AddWeighted, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(MatDepth(CV_32F)),
Values(MatDepth(CV_32F)),
Values(MatDepth(CV_32F))
))
#else
PERF_TEST_P(Sz_3Depth, Core_AddWeighted, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F),
Values(CV_8U, CV_16U, CV_32F, CV_64F),
Values(CV_8U, CV_16U, CV_32F, CV_64F)))
Values(CV_8U, CV_16U, CV_32F, CV_64F)
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth1 = GET_PARAM(1);
@ -1782,10 +1888,19 @@ PERF_TEST_P(Sz, Core_MeanStdDev,
DEF_PARAM_TEST(Sz_Depth_Norm, cv::Size, MatDepth, NormType);
PERF_TEST_P(Sz_Depth_Norm, Core_Norm,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Norm, Core_Norm, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F),
Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2))
))
#else
PERF_TEST_P(Sz_Depth_Norm, Core_Norm, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32S, CV_32F),
Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
Values(NormType(cv::NORM_INF), NormType(cv::NORM_L1), NormType(cv::NORM_L2))
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -1859,10 +1974,19 @@ PERF_TEST_P(Sz_Norm, Core_NormDiff,
//////////////////////////////////////////////////////////////////////
// Sum
PERF_TEST_P(Sz_Depth_Cn, Core_Sum,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Cn, Core_Sum, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F),
testing::Values(MatCn(Gray))
))
#else
PERF_TEST_P(Sz_Depth_Cn, Core_Sum, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4))
GPU_CHANNELS_1_3_4
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -1896,10 +2020,19 @@ PERF_TEST_P(Sz_Depth_Cn, Core_Sum,
//////////////////////////////////////////////////////////////////////
// SumAbs
PERF_TEST_P(Sz_Depth_Cn, Core_SumAbs,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Cn, Core_SumAbs, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F),
testing::Values(MatCn(Gray))
))
#else
PERF_TEST_P(Sz_Depth_Cn, Core_SumAbs, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4))
GPU_CHANNELS_1_3_4
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -1929,10 +2062,19 @@ PERF_TEST_P(Sz_Depth_Cn, Core_SumAbs,
//////////////////////////////////////////////////////////////////////
// SumSqr
PERF_TEST_P(Sz_Depth_Cn, Core_SumSqr,
Combine(GPU_TYPICAL_MAT_SIZES,
Values<MatDepth>(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4))
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Cn, Core_SumSqr, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F),
testing::Values(MatCn(Gray))
))
#else
PERF_TEST_P(Sz_Depth_Cn, Core_SumSqr, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -1962,9 +2104,17 @@ PERF_TEST_P(Sz_Depth_Cn, Core_SumSqr,
//////////////////////////////////////////////////////////////////////
// MinMax
PERF_TEST_P(Sz_Depth, Core_MinMax,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F)))
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth, Core_MinMax, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F)
))
#else
PERF_TEST_P(Sz_Depth, Core_MinMax, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F)
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -2000,9 +2150,17 @@ PERF_TEST_P(Sz_Depth, Core_MinMax,
//////////////////////////////////////////////////////////////////////
// MinMaxLoc
PERF_TEST_P(Sz_Depth, Core_MinMaxLoc,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F)))
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth, Core_MinMaxLoc, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F)
))
#else
PERF_TEST_P(Sz_Depth, Core_MinMaxLoc, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F)
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -2040,9 +2198,17 @@ PERF_TEST_P(Sz_Depth, Core_MinMaxLoc,
//////////////////////////////////////////////////////////////////////
// CountNonZero
PERF_TEST_P(Sz_Depth, Core_CountNonZero,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F)))
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth, Core_CountNonZero, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F)
))
#else
PERF_TEST_P(Sz_Depth, Core_CountNonZero, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F)
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -2079,12 +2245,23 @@ CV_ENUM(ReduceDim, Rows, Cols)
DEF_PARAM_TEST(Sz_Depth_Cn_Code_Dim, cv::Size, MatDepth, MatCn, ReduceCode, ReduceDim);
PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Core_Reduce,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Core_Reduce, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F),
Values(1, 2, 3, 4),
ReduceCode::all(),
ReduceDim::all()
))
#else
PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Core_Reduce, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_16S, CV_32F),
Values(1, 2, 3, 4),
ReduceCode::all(),
ReduceDim::all()))
ReduceDim::all()
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -2120,13 +2297,25 @@ PERF_TEST_P(Sz_Depth_Cn_Code_Dim, Core_Reduce,
DEF_PARAM_TEST(Sz_Depth_NormType, cv::Size, MatDepth, NormType);
PERF_TEST_P(Sz_Depth_NormType, Core_Normalize,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_NormType, Core_Normalize, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F),
Values(NormType(cv::NORM_INF),
NormType(cv::NORM_L1),
NormType(cv::NORM_L2),
NormType(cv::NORM_MINMAX))
))
#else
PERF_TEST_P(Sz_Depth_NormType, Core_Normalize, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F),
Values(NormType(cv::NORM_INF),
NormType(cv::NORM_L1),
NormType(cv::NORM_L2),
NormType(cv::NORM_MINMAX))))
NormType(cv::NORM_MINMAX))
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
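
The ARITHM_MAT_DEPTH macro defined at the top of this file gives the arithmetic suite a single switch for depth coverage, and the same #ifdef duplication of each PERF_TEST_P generator repeats through the rest of the perf changes below. A sketch of how a test consumes the macro, using the same framework helpers as the surrounding tests (Core_Demo is an illustrative name, not a benchmark added by this patch):

PERF_TEST_P(Sz_Depth, Core_Demo, Combine(
    GPU_TYPICAL_MAT_SIZES,
    ARITHM_MAT_DEPTH   // CV_8U and CV_32F in the tiny build, the full list otherwise
))
{
    const cv::Size size = GET_PARAM(0);
    const int depth = GET_PARAM(1);

    cv::Mat src(size, depth);
    declare.in(src, WARMUP_RNG);

    // body shaped like the tests above: upload to a GpuMat, wrap the gpu call
    // in TEST_CYCLE(), then run the usual sanity check on the result
}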

View File

@ -145,9 +145,17 @@ PERF_TEST_P(Image_NFeatures, Features2D_ORB,
DEF_PARAM_TEST(DescSize_Norm, int, NormType);
PERF_TEST_P(DescSize_Norm, Features2D_BFMatch,
Combine(Values(64, 128, 256),
Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(DescSize_Norm, Features2D_BFMatch, Combine(
Values(64, 128, 256),
Values(NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))
))
#else
PERF_TEST_P(DescSize_Norm, Features2D_BFMatch, Combine(
Values(64, 128, 256),
Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))
))
#endif
{
declare.time(20.0);
@ -202,10 +210,19 @@ static void toOneRowMatches(const std::vector< std::vector<cv::DMatch> >& src, s
DEF_PARAM_TEST(DescSize_K_Norm, int, int, NormType);
PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch,
Combine(Values(64, 128, 256),
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch, Combine(
Values(64, 128, 256),
Values(2, 3),
Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
Values(NormType(cv::NORM_L2))
))
#else
PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch, Combine(
Values(64, 128, 256),
Values(2, 3),
Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))
))
#endif
{
declare.time(30.0);
@ -257,9 +274,17 @@ PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch,
//////////////////////////////////////////////////////////////////////
// BFRadiusMatch
PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch,
Combine(Values(64, 128, 256),
Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch, Combine(
Values(64, 128, 256),
Values(NormType(cv::NORM_L2))
))
#else
PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch, Combine(
Values(64, 128, 256),
Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))
))
#endif
{
declare.time(30.0);

View File

@ -87,7 +87,19 @@ PERF_TEST_P(Sz_Type_KernelSz, Filters_Blur,
//////////////////////////////////////////////////////////////////////
// Sobel
PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8UC1, CV_8UC4, CV_32FC1),
Values(3, 5, 7)
))
#else
PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8UC1, CV_8UC4, CV_32FC1),
Values(3, 5, 7, 9, 11, 13, 15)
))
#endif
{
declare.time(20.0);
@ -154,7 +166,19 @@ PERF_TEST_P(Sz_Type, Filters_Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U
//////////////////////////////////////////////////////////////////////
// GaussianBlur
PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8UC1, CV_8UC4, CV_32FC1),
Values(3, 5, 7)
))
#else
PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8UC1, CV_8UC4, CV_32FC1),
Values(3, 5, 7, 9, 11, 13, 15)
))
#endif
{
declare.time(20.0);

View File

@ -91,13 +91,25 @@ void generateMap(cv::Mat& map_x, cv::Mat& map_y, int remapMode)
DEF_PARAM_TEST(Sz_Depth_Cn_Inter_Border_Mode, cv::Size, MatDepth, MatCn, Interpolation, BorderMode, RemapMode);
PERF_TEST_P(Sz_Depth_Cn_Inter_Border_Mode, ImgProc_Remap,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Cn_Inter_Border_Mode, ImgProc_Remap, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F),
GPU_CHANNELS_1_3_4,
Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
ALL_BORDER_MODES,
RemapMode::all()
))
#else
PERF_TEST_P(Sz_Depth_Cn_Inter_Border_Mode, ImgProc_Remap, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4,
Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
ALL_BORDER_MODES,
RemapMode::all()))
RemapMode::all()
))
#endif
{
declare.time(20.0);
@ -143,12 +155,23 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Border_Mode, ImgProc_Remap,
DEF_PARAM_TEST(Sz_Depth_Cn_Inter_Scale, cv::Size, MatDepth, MatCn, Interpolation, double);
PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ImgProc_Resize,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ImgProc_Resize, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F),
GPU_CHANNELS_1_3_4,
Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
Values(0.5, 0.3, 2.0)
))
#else
PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ImgProc_Resize, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4,
Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
Values(0.5, 0.3, 2.0)))
Values(0.5, 0.3, 2.0)
))
#endif
{
declare.time(20.0);
@ -187,11 +210,21 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Scale, ImgProc_Resize,
DEF_PARAM_TEST(Sz_Depth_Cn_Scale, cv::Size, MatDepth, MatCn, double);
PERF_TEST_P(Sz_Depth_Cn_Scale, ImgProc_ResizeArea,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Cn_Scale, ImgProc_ResizeArea, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F),
GPU_CHANNELS_1_3_4,
Values(0.2, 0.1, 0.05)
))
#else
PERF_TEST_P(Sz_Depth_Cn_Scale, ImgProc_ResizeArea, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4,
Values(0.2, 0.1, 0.05)))
Values(0.2, 0.1, 0.05)
))
#endif
{
declare.time(1.0);
@ -230,12 +263,23 @@ PERF_TEST_P(Sz_Depth_Cn_Scale, ImgProc_ResizeArea,
DEF_PARAM_TEST(Sz_Depth_Cn_Inter_Border, cv::Size, MatDepth, MatCn, Interpolation, BorderMode);
PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpAffine,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpAffine, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F),
GPU_CHANNELS_1_3_4,
Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
ALL_BORDER_MODES)
)
#else
PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpAffine, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4,
Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
ALL_BORDER_MODES))
ALL_BORDER_MODES)
)
#endif
{
declare.time(20.0);
@ -280,12 +324,23 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpAffine,
//////////////////////////////////////////////////////////////////////
// WarpPerspective
PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpPerspective,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpPerspective, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F),
GPU_CHANNELS_1_3_4,
Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
ALL_BORDER_MODES)
)
#else
PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpPerspective, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4,
Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
ALL_BORDER_MODES))
ALL_BORDER_MODES)
)
#endif
{
declare.time(20.0);
@ -330,11 +385,21 @@ PERF_TEST_P(Sz_Depth_Cn_Inter_Border, ImgProc_WarpPerspective,
DEF_PARAM_TEST(Sz_Depth_Cn_Border, cv::Size, MatDepth, MatCn, BorderMode);
PERF_TEST_P(Sz_Depth_Cn_Border, ImgProc_CopyMakeBorder,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Cn_Border, ImgProc_CopyMakeBorder, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F),
GPU_CHANNELS_1_3_4,
ALL_BORDER_MODES)
)
#else
PERF_TEST_P(Sz_Depth_Cn_Border, ImgProc_CopyMakeBorder, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4,
ALL_BORDER_MODES))
ALL_BORDER_MODES)
)
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -372,10 +437,19 @@ CV_ENUM(ThreshOp, THRESH_BINARY, THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO,
DEF_PARAM_TEST(Sz_Depth_Op, cv::Size, MatDepth, ThreshOp);
PERF_TEST_P(Sz_Depth_Op, ImgProc_Threshold,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Op, ImgProc_Threshold, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F),
ThreshOp::all()
))
#else
PERF_TEST_P(Sz_Depth_Op, ImgProc_Threshold, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F, CV_64F),
ThreshOp::all()))
ThreshOp::all()
))
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -672,10 +746,19 @@ PERF_TEST_P(Sz, ImgProc_ColumnSum,
DEF_PARAM_TEST(Image_AppertureSz_L2gradient, string, int, bool);
PERF_TEST_P(Image_AppertureSz_L2gradient, ImgProc_Canny,
Combine(Values("perf/800x600.png", "perf/1280x1024.png", "perf/1680x1050.png"),
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Image_AppertureSz_L2gradient, ImgProc_Canny, Combine(
Values("perf/800x600.png", "perf/1280x1024.png", "perf/1680x1050.png"),
Values(3),
Bool()
))
#else
PERF_TEST_P(Image_AppertureSz_L2gradient, ImgProc_Canny, Combine(
Values("perf/800x600.png", "perf/1280x1024.png", "perf/1680x1050.png"),
Values(3, 5),
Bool()))
Bool()
))
#endif
{
const string fileName = GET_PARAM(0);
const int apperture_size = GET_PARAM(1);
@ -1300,10 +1383,19 @@ PERF_TEST_P(Sz_Depth_Cn_Inter, ImgProc_Rotate,
//////////////////////////////////////////////////////////////////////
// PyrDown
PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrDown,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrDown, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F),
GPU_CHANNELS_1_3_4)
)
#else
PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrDown, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4))
GPU_CHANNELS_1_3_4)
)
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
@ -1336,10 +1428,19 @@ PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrDown,
//////////////////////////////////////////////////////////////////////
// PyrUp
PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrUp,
Combine(GPU_TYPICAL_MAT_SIZES,
#ifdef OPENCV_TINY_GPU_MODULE
PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrUp, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_32F),
GPU_CHANNELS_1_3_4)
)
#else
PERF_TEST_P(Sz_Depth_Cn, ImgProc_PyrUp, Combine(
GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16U, CV_32F),
GPU_CHANNELS_1_3_4))
GPU_CHANNELS_1_3_4)
)
#endif
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);

View File

@ -204,6 +204,26 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
cudaStream_t stream);
#ifdef OPENCV_TINY_GPU_MODULE
static const caller_t callers[3][6] =
{
{
0/*matchL1_gpu<unsigned char>*/, 0/*matchL1_gpu<signed char>*/,
0/*matchL1_gpu<unsigned short>*/, 0/*matchL1_gpu<short>*/,
0/*matchL1_gpu<int>*/, 0/*matchL1_gpu<float>*/
},
{
0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
0/*matchL2_gpu<int>*/, matchL2_gpu<float>
},
{
matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
0/*matchHamming_gpu<unsigned short>*/, 0/*matchHamming_gpu<short>*/,
0/*matchHamming_gpu<int>*/, 0/*matchHamming_gpu<float>*/
}
};
#else
static const caller_t callers[3][6] =
{
{
@ -222,6 +242,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
}
};
#endif
CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
CV_Assert(train.cols == query.cols && train.type() == query.type());
@ -334,6 +355,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
const GpuMat& masks, Stream& stream)
{
#ifdef OPENCV_TINY_GPU_MODULE
(void)query;
(void)trainCollection;
(void)trainIdx;
(void)imgIdx;
(void)distance;
(void)masks;
(void)stream;
CV_Error(CV_StsNotImplemented, "not available in tiny build");
#else
if (query.empty() || trainCollection.empty())
return;
@ -374,6 +405,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
CV_Assert(func != 0);
func(query, trainCollection, masks, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
#endif
}
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
@ -451,6 +483,26 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
cudaStream_t stream);
#ifdef OPENCV_TINY_GPU_MODULE
static const caller_t callers[3][6] =
{
{
0/*matchL1_gpu<unsigned char>*/, 0/*matchL1_gpu<signed char>*/,
0/*matchL1_gpu<unsigned short>*/, 0/*matchL1_gpu<short>*/,
0/*matchL1_gpu<int>*/, 0/*matchL1_gpu<float>*/
},
{
0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
0/*matchL2_gpu<int>*/, matchL2_gpu<float>
},
{
matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
0/*matchHamming_gpu<unsigned short>*/, 0/*matchHamming_gpu<short>*/,
0/*matchHamming_gpu<int>*/, 0/*matchHamming_gpu<float>*/
}
};
#else
static const caller_t callers[3][6] =
{
{
@ -469,6 +521,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
}
};
#endif
CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
CV_Assert(train.type() == query.type() && train.cols == query.cols);
@ -568,6 +621,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
const GpuMat& maskCollection, Stream& stream)
{
#ifdef OPENCV_TINY_GPU_MODULE
(void)query;
(void)trainCollection;
(void)trainIdx;
(void)imgIdx;
(void)distance;
(void)maskCollection;
(void)stream;
CV_Error(CV_StsNotImplemented, "not available in tiny build");
#else
if (query.empty() || trainCollection.empty())
return;
@ -613,6 +676,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
CV_Assert(func != 0);
func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, StreamAccessor::getStream(stream));
#endif
}
void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
@ -755,6 +819,26 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream);
#ifdef OPENCV_TINY_GPU_MODULE
static const caller_t callers[3][6] =
{
{
0/*matchL1_gpu<unsigned char>*/, 0/*matchL1_gpu<signed char>*/,
0/*matchL1_gpu<unsigned short>*/, 0/*matchL1_gpu<short>*/,
0/*matchL1_gpu<int>*/, 0/*matchL1_gpu<float>*/
},
{
0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
0/*matchL2_gpu<int>*/, matchL2_gpu<float>
},
{
matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
0/*matchHamming_gpu<unsigned short>*/, 0/*matchHamming_gpu<short>*/,
0/*matchHamming_gpu<int>*/, 0/*matchHamming_gpu<float>*/
}
};
#else
static const caller_t callers[3][6] =
{
{
@ -773,6 +857,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
}
};
#endif
const int nQuery = query.rows;
const int nTrain = train.rows;
@ -872,6 +957,17 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& query, const
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches,
float maxDistance, const vector<GpuMat>& masks, Stream& stream)
{
#ifdef OPENCV_TINY_GPU_MODULE
(void)query;
(void)trainIdx;
(void)imgIdx;
(void)distance;
(void)nMatches;
(void)maxDistance;
(void)masks;
(void)stream;
CV_Error(CV_StsNotImplemented, "not available in tiny build");
#else
if (query.empty() || empty())
return;
@ -926,6 +1022,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream));
#endif
}
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
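
The matcher's depth dispatch is a [distance][depth] table of kernel launchers; the tiny variant zeroes every entry whose kernel is no longer compiled, and the existing CV_Assert(func != 0) turns a request for a missing depth into a clean assertion rather than a link error. A reduced, self-contained sketch of that lookup pattern (the launcher names and simplified signature are stand-ins, not the real kernels):

#include <opencv2/core/core.hpp>

typedef void (*caller_t)();                  // real callers take GpuMat/stream arguments

static void launchL2_32f()     { /* would launch the CUDA kernel */ }
static void launchHamming_8u() { /* would launch the CUDA kernel */ }

// rows: L1, L2, Hamming; columns: CV_8U .. CV_32F; 0 marks "not built"
static const caller_t callers[3][6] =
{
    { 0, 0, 0, 0, 0, 0 },                    // L1: dropped entirely in the tiny build
    { 0, 0, 0, 0, 0, launchL2_32f },         // L2: CV_32F only
    { launchHamming_8u, 0, 0, 0, 0, 0 }      // Hamming: CV_8U only
};

static void dispatch(int distIdx, int depth)
{
    const caller_t func = callers[distIdx][depth];
    CV_Assert(func != 0);                    // unsupported depths fail loudly here
    func();
}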

View File

@ -71,6 +71,12 @@ namespace cv { namespace gpu {
using namespace ::cv::gpu::device;
#ifdef OPENCV_TINY_GPU_MODULE
#define APPEND_16U(func) 0
#else
#define APPEND_16U(func) func ## _16u
#endif
namespace
{
typedef void (*gpu_func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
@ -78,10 +84,11 @@ namespace
void bgr_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
static const gpu_func_t funcs[] = {bgr_to_rgb_8u, 0, bgr_to_rgb_16u, 0, 0, bgr_to_rgb_32f};
static const gpu_func_t funcs[] = {bgr_to_rgb_8u, 0, APPEND_16U(bgr_to_rgb), 0, 0, bgr_to_rgb_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3);
CV_Assert(funcs[src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
@ -91,10 +98,11 @@ namespace
void bgr_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
static const gpu_func_t funcs[] = {bgr_to_bgra_8u, 0, bgr_to_bgra_16u, 0, 0, bgr_to_bgra_32f};
static const gpu_func_t funcs[] = {bgr_to_bgra_8u, 0, APPEND_16U(bgr_to_bgra), 0, 0, bgr_to_bgra_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3);
CV_Assert(funcs[src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
@ -104,10 +112,11 @@ namespace
void bgr_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
static const gpu_func_t funcs[] = {bgr_to_rgba_8u, 0, bgr_to_rgba_16u, 0, 0, bgr_to_rgba_32f};
static const gpu_func_t funcs[] = {bgr_to_rgba_8u, 0, APPEND_16U(bgr_to_rgba), 0, 0, bgr_to_rgba_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3);
CV_Assert(funcs[src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
@ -117,10 +126,11 @@ namespace
void bgra_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
static const gpu_func_t funcs[] = {bgra_to_bgr_8u, 0, bgra_to_bgr_16u, 0, 0, bgra_to_bgr_32f};
static const gpu_func_t funcs[] = {bgra_to_bgr_8u, 0, APPEND_16U(bgra_to_bgr), 0, 0, bgra_to_bgr_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 4);
CV_Assert(funcs[src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
@ -130,10 +140,11 @@ namespace
void bgra_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
static const gpu_func_t funcs[] = {bgra_to_rgb_8u, 0, bgra_to_rgb_16u, 0, 0, bgra_to_rgb_32f};
static const gpu_func_t funcs[] = {bgra_to_rgb_8u, 0, APPEND_16U(bgra_to_rgb), 0, 0, bgra_to_rgb_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 4);
CV_Assert(funcs[src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
@ -143,10 +154,11 @@ namespace
void bgra_to_rgba(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
static const gpu_func_t funcs[] = {bgra_to_rgba_8u, 0, bgra_to_rgba_16u, 0, 0, bgra_to_rgba_32f};
static const gpu_func_t funcs[] = {bgra_to_rgba_8u, 0, APPEND_16U(bgra_to_rgba), 0, 0, bgra_to_rgba_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 4);
CV_Assert(funcs[src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
@ -316,10 +328,11 @@ namespace
void gray_to_bgr(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
static const gpu_func_t funcs[] = {gray_to_bgr_8u, 0, gray_to_bgr_16u, 0, 0, gray_to_bgr_32f};
static const gpu_func_t funcs[] = {gray_to_bgr_8u, 0, APPEND_16U(gray_to_bgr), 0, 0, gray_to_bgr_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 1);
CV_Assert(funcs[src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), 3));
@ -329,10 +342,11 @@ namespace
void gray_to_bgra(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
static const gpu_func_t funcs[] = {gray_to_bgra_8u, 0, gray_to_bgra_16u, 0, 0, gray_to_bgra_32f};
static const gpu_func_t funcs[] = {gray_to_bgra_8u, 0, APPEND_16U(gray_to_bgra), 0, 0, gray_to_bgra_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 1);
CV_Assert(funcs[src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), 4));
@ -382,10 +396,11 @@ namespace
void rgb_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
static const gpu_func_t funcs[] = {rgb_to_gray_8u, 0, rgb_to_gray_16u, 0, 0, rgb_to_gray_32f};
static const gpu_func_t funcs[] = {rgb_to_gray_8u, 0, APPEND_16U(rgb_to_gray), 0, 0, rgb_to_gray_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3);
CV_Assert(funcs[src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
@ -395,10 +410,11 @@ namespace
void bgr_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
static const gpu_func_t funcs[] = {bgr_to_gray_8u, 0, bgr_to_gray_16u, 0, 0, bgr_to_gray_32f};
static const gpu_func_t funcs[] = {bgr_to_gray_8u, 0, APPEND_16U(bgr_to_gray), 0, 0, bgr_to_gray_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3);
CV_Assert(funcs[src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
@ -408,10 +424,11 @@ namespace
void rgba_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
static const gpu_func_t funcs[] = {rgba_to_gray_8u, 0, rgba_to_gray_16u, 0, 0, rgba_to_gray_32f};
static const gpu_func_t funcs[] = {rgba_to_gray_8u, 0, APPEND_16U(rgba_to_gray), 0, 0, rgba_to_gray_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 4);
CV_Assert(funcs[src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
@ -421,10 +438,11 @@ namespace
void bgra_to_gray(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
using namespace cv::gpu::device;
static const gpu_func_t funcs[] = {bgra_to_gray_8u, 0, bgra_to_gray_16u, 0, 0, bgra_to_gray_32f};
static const gpu_func_t funcs[] = {bgra_to_gray_8u, 0, APPEND_16U(bgra_to_gray), 0, 0, bgra_to_gray_32f};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 4);
CV_Assert(funcs[src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), 1));
@ -437,12 +455,12 @@ namespace
static const gpu_func_t funcs[2][2][6] =
{
{
{rgb_to_yuv_8u, 0, rgb_to_yuv_16u, 0, 0, rgb_to_yuv_32f},
{rgba_to_yuv_8u, 0, rgba_to_yuv_16u, 0, 0, rgba_to_yuv_32f}
{rgb_to_yuv_8u, 0, APPEND_16U(rgb_to_yuv), 0, 0, rgb_to_yuv_32f},
{rgba_to_yuv_8u, 0, APPEND_16U(rgba_to_yuv), 0, 0, rgba_to_yuv_32f}
},
{
{rgb_to_yuv4_8u, 0, rgb_to_yuv4_16u, 0, 0, rgb_to_yuv4_32f},
{rgba_to_yuv4_8u, 0, rgba_to_yuv4_16u, 0, 0, rgba_to_yuv4_32f}
{rgb_to_yuv4_8u, 0, APPEND_16U(rgb_to_yuv4), 0, 0, rgb_to_yuv4_32f},
{rgba_to_yuv4_8u, 0, APPEND_16U(rgba_to_yuv4), 0, 0, rgba_to_yuv4_32f}
}
};
@ -451,6 +469,7 @@ namespace
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3 || src.channels() == 4);
CV_Assert(dcn == 3 || dcn == 4);
CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -463,12 +482,12 @@ namespace
static const gpu_func_t funcs[2][2][6] =
{
{
{bgr_to_yuv_8u, 0, bgr_to_yuv_16u, 0, 0, bgr_to_yuv_32f},
{bgra_to_yuv_8u, 0, bgra_to_yuv_16u, 0, 0, bgra_to_yuv_32f}
{bgr_to_yuv_8u, 0, APPEND_16U(bgr_to_yuv), 0, 0, bgr_to_yuv_32f},
{bgra_to_yuv_8u, 0, APPEND_16U(bgra_to_yuv), 0, 0, bgra_to_yuv_32f}
},
{
{bgr_to_yuv4_8u, 0, bgr_to_yuv4_16u, 0, 0, bgr_to_yuv4_32f},
{bgra_to_yuv4_8u, 0, bgra_to_yuv4_16u, 0, 0, bgra_to_yuv4_32f}
{bgr_to_yuv4_8u, 0, APPEND_16U(bgr_to_yuv4), 0, 0, bgr_to_yuv4_32f},
{bgra_to_yuv4_8u, 0, APPEND_16U(bgra_to_yuv4), 0, 0, bgra_to_yuv4_32f}
}
};
@ -477,6 +496,7 @@ namespace
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3 || src.channels() == 4);
CV_Assert(dcn == 3 || dcn == 4);
CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -489,12 +509,12 @@ namespace
static const gpu_func_t funcs[2][2][6] =
{
{
{yuv_to_rgb_8u, 0, yuv_to_rgb_16u, 0, 0, yuv_to_rgb_32f},
{yuv4_to_rgb_8u, 0, yuv4_to_rgb_16u, 0, 0, yuv4_to_rgb_32f}
{yuv_to_rgb_8u, 0, APPEND_16U(yuv_to_rgb), 0, 0, yuv_to_rgb_32f},
{yuv4_to_rgb_8u, 0, APPEND_16U(yuv4_to_rgb), 0, 0, yuv4_to_rgb_32f}
},
{
{yuv_to_rgba_8u, 0, yuv_to_rgba_16u, 0, 0, yuv_to_rgba_32f},
{yuv4_to_rgba_8u, 0, yuv4_to_rgba_16u, 0, 0, yuv4_to_rgba_32f}
{yuv_to_rgba_8u, 0, APPEND_16U(yuv_to_rgba), 0, 0, yuv_to_rgba_32f},
{yuv4_to_rgba_8u, 0, APPEND_16U(yuv4_to_rgba), 0, 0, yuv4_to_rgba_32f}
}
};
@ -503,6 +523,7 @@ namespace
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3 || src.channels() == 4);
CV_Assert(dcn == 3 || dcn == 4);
CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -515,12 +536,12 @@ namespace
static const gpu_func_t funcs[2][2][6] =
{
{
{yuv_to_bgr_8u, 0, yuv_to_bgr_16u, 0, 0, yuv_to_bgr_32f},
{yuv4_to_bgr_8u, 0, yuv4_to_bgr_16u, 0, 0, yuv4_to_bgr_32f}
{yuv_to_bgr_8u, 0, APPEND_16U(yuv_to_bgr), 0, 0, yuv_to_bgr_32f},
{yuv4_to_bgr_8u, 0, APPEND_16U(yuv4_to_bgr), 0, 0, yuv4_to_bgr_32f}
},
{
{yuv_to_bgra_8u, 0, yuv_to_bgra_16u, 0, 0, yuv_to_bgra_32f},
{yuv4_to_bgra_8u, 0, yuv4_to_bgra_16u, 0, 0, yuv4_to_bgra_32f}
{yuv_to_bgra_8u, 0, APPEND_16U(yuv_to_bgra), 0, 0, yuv_to_bgra_32f},
{yuv4_to_bgra_8u, 0, APPEND_16U(yuv4_to_bgra), 0, 0, yuv4_to_bgra_32f}
}
};
@ -529,6 +550,7 @@ namespace
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3 || src.channels() == 4);
CV_Assert(dcn == 3 || dcn == 4);
CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -541,12 +563,12 @@ namespace
static const gpu_func_t funcs[2][2][6] =
{
{
{rgb_to_YCrCb_8u, 0, rgb_to_YCrCb_16u, 0, 0, rgb_to_YCrCb_32f},
{rgba_to_YCrCb_8u, 0, rgba_to_YCrCb_16u, 0, 0, rgba_to_YCrCb_32f}
{rgb_to_YCrCb_8u, 0, APPEND_16U(rgb_to_YCrCb), 0, 0, rgb_to_YCrCb_32f},
{rgba_to_YCrCb_8u, 0, APPEND_16U(rgba_to_YCrCb), 0, 0, rgba_to_YCrCb_32f}
},
{
{rgb_to_YCrCb4_8u, 0, rgb_to_YCrCb4_16u, 0, 0, rgb_to_YCrCb4_32f},
{rgba_to_YCrCb4_8u, 0, rgba_to_YCrCb4_16u, 0, 0, rgba_to_YCrCb4_32f}
{rgb_to_YCrCb4_8u, 0, APPEND_16U(rgb_to_YCrCb4), 0, 0, rgb_to_YCrCb4_32f},
{rgba_to_YCrCb4_8u, 0, APPEND_16U(rgba_to_YCrCb4), 0, 0, rgba_to_YCrCb4_32f}
}
};
@ -555,6 +577,7 @@ namespace
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3 || src.channels() == 4);
CV_Assert(dcn == 3 || dcn == 4);
CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -567,12 +590,12 @@ namespace
static const gpu_func_t funcs[2][2][6] =
{
{
{bgr_to_YCrCb_8u, 0, bgr_to_YCrCb_16u, 0, 0, bgr_to_YCrCb_32f},
{bgra_to_YCrCb_8u, 0, bgra_to_YCrCb_16u, 0, 0, bgra_to_YCrCb_32f}
{bgr_to_YCrCb_8u, 0, APPEND_16U(bgr_to_YCrCb), 0, 0, bgr_to_YCrCb_32f},
{bgra_to_YCrCb_8u, 0, APPEND_16U(bgra_to_YCrCb), 0, 0, bgra_to_YCrCb_32f}
},
{
{bgr_to_YCrCb4_8u, 0, bgr_to_YCrCb4_16u, 0, 0, bgr_to_YCrCb4_32f},
{bgra_to_YCrCb4_8u, 0, bgra_to_YCrCb4_16u, 0, 0, bgra_to_YCrCb4_32f}
{bgr_to_YCrCb4_8u, 0, APPEND_16U(bgr_to_YCrCb4), 0, 0, bgr_to_YCrCb4_32f},
{bgra_to_YCrCb4_8u, 0, APPEND_16U(bgra_to_YCrCb4), 0, 0, bgra_to_YCrCb4_32f}
}
};
@ -581,6 +604,7 @@ namespace
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3 || src.channels() == 4);
CV_Assert(dcn == 3 || dcn == 4);
CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -593,12 +617,12 @@ namespace
static const gpu_func_t funcs[2][2][6] =
{
{
{YCrCb_to_rgb_8u, 0, YCrCb_to_rgb_16u, 0, 0, YCrCb_to_rgb_32f},
{YCrCb4_to_rgb_8u, 0, YCrCb4_to_rgb_16u, 0, 0, YCrCb4_to_rgb_32f}
{YCrCb_to_rgb_8u, 0, APPEND_16U(YCrCb_to_rgb), 0, 0, YCrCb_to_rgb_32f},
{YCrCb4_to_rgb_8u, 0, APPEND_16U(YCrCb4_to_rgb), 0, 0, YCrCb4_to_rgb_32f}
},
{
{YCrCb_to_rgba_8u, 0, YCrCb_to_rgba_16u, 0, 0, YCrCb_to_rgba_32f},
{YCrCb4_to_rgba_8u, 0, YCrCb4_to_rgba_16u, 0, 0, YCrCb4_to_rgba_32f}
{YCrCb_to_rgba_8u, 0, APPEND_16U(YCrCb_to_rgba), 0, 0, YCrCb_to_rgba_32f},
{YCrCb4_to_rgba_8u, 0, APPEND_16U(YCrCb4_to_rgba), 0, 0, YCrCb4_to_rgba_32f}
}
};
@ -607,6 +631,7 @@ namespace
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3 || src.channels() == 4);
CV_Assert(dcn == 3 || dcn == 4);
CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -619,12 +644,12 @@ namespace
static const gpu_func_t funcs[2][2][6] =
{
{
{YCrCb_to_bgr_8u, 0, YCrCb_to_bgr_16u, 0, 0, YCrCb_to_bgr_32f},
{YCrCb4_to_bgr_8u, 0, YCrCb4_to_bgr_16u, 0, 0, YCrCb4_to_bgr_32f}
{YCrCb_to_bgr_8u, 0, APPEND_16U(YCrCb_to_bgr), 0, 0, YCrCb_to_bgr_32f},
{YCrCb4_to_bgr_8u, 0, APPEND_16U(YCrCb4_to_bgr), 0, 0, YCrCb4_to_bgr_32f}
},
{
{YCrCb_to_bgra_8u, 0, YCrCb_to_bgra_16u, 0, 0, YCrCb_to_bgra_32f},
{YCrCb4_to_bgra_8u, 0, YCrCb4_to_bgra_16u, 0, 0, YCrCb4_to_bgra_32f}
{YCrCb_to_bgra_8u, 0, APPEND_16U(YCrCb_to_bgra), 0, 0, YCrCb_to_bgra_32f},
{YCrCb4_to_bgra_8u, 0, APPEND_16U(YCrCb4_to_bgra), 0, 0, YCrCb4_to_bgra_32f}
}
};
@ -633,6 +658,7 @@ namespace
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3 || src.channels() == 4);
CV_Assert(dcn == 3 || dcn == 4);
CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -645,12 +671,12 @@ namespace
static const gpu_func_t funcs[2][2][6] =
{
{
{rgb_to_xyz_8u, 0, rgb_to_xyz_16u, 0, 0, rgb_to_xyz_32f},
{rgba_to_xyz_8u, 0, rgba_to_xyz_16u, 0, 0, rgba_to_xyz_32f}
{rgb_to_xyz_8u, 0, APPEND_16U(rgb_to_xyz), 0, 0, rgb_to_xyz_32f},
{rgba_to_xyz_8u, 0, APPEND_16U(rgba_to_xyz), 0, 0, rgba_to_xyz_32f}
},
{
{rgb_to_xyz4_8u, 0, rgb_to_xyz4_16u, 0, 0, rgb_to_xyz4_32f},
{rgba_to_xyz4_8u, 0, rgba_to_xyz4_16u, 0, 0, rgba_to_xyz4_32f}
{rgb_to_xyz4_8u, 0, APPEND_16U(rgb_to_xyz4), 0, 0, rgb_to_xyz4_32f},
{rgba_to_xyz4_8u, 0, APPEND_16U(rgba_to_xyz4), 0, 0, rgba_to_xyz4_32f}
}
};
@ -659,6 +685,7 @@ namespace
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3 || src.channels() == 4);
CV_Assert(dcn == 3 || dcn == 4);
CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -671,12 +698,12 @@ namespace
static const gpu_func_t funcs[2][2][6] =
{
{
{bgr_to_xyz_8u, 0, bgr_to_xyz_16u, 0, 0, bgr_to_xyz_32f},
{bgra_to_xyz_8u, 0, bgra_to_xyz_16u, 0, 0, bgra_to_xyz_32f}
{bgr_to_xyz_8u, 0, APPEND_16U(bgr_to_xyz), 0, 0, bgr_to_xyz_32f},
{bgra_to_xyz_8u, 0, APPEND_16U(bgra_to_xyz), 0, 0, bgra_to_xyz_32f}
},
{
{bgr_to_xyz4_8u, 0, bgr_to_xyz4_16u, 0, 0, bgr_to_xyz4_32f},
{bgra_to_xyz4_8u, 0, bgra_to_xyz4_16u, 0, 0, bgra_to_xyz4_32f}
{bgr_to_xyz4_8u, 0, APPEND_16U(bgr_to_xyz4), 0, 0, bgr_to_xyz4_32f},
{bgra_to_xyz4_8u, 0, APPEND_16U(bgra_to_xyz4), 0, 0, bgra_to_xyz4_32f}
}
};
@ -685,6 +712,7 @@ namespace
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3 || src.channels() == 4);
CV_Assert(dcn == 3 || dcn == 4);
CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -697,12 +725,12 @@ namespace
static const gpu_func_t funcs[2][2][6] =
{
{
{xyz_to_rgb_8u, 0, xyz_to_rgb_16u, 0, 0, xyz_to_rgb_32f},
{xyz4_to_rgb_8u, 0, xyz4_to_rgb_16u, 0, 0, xyz4_to_rgb_32f}
{xyz_to_rgb_8u, 0, APPEND_16U(xyz_to_rgb), 0, 0, xyz_to_rgb_32f},
{xyz4_to_rgb_8u, 0, APPEND_16U(xyz4_to_rgb), 0, 0, xyz4_to_rgb_32f}
},
{
{xyz_to_rgba_8u, 0, xyz_to_rgba_16u, 0, 0, xyz_to_rgba_32f},
{xyz4_to_rgba_8u, 0, xyz4_to_rgba_16u, 0, 0, xyz4_to_rgba_32f}
{xyz_to_rgba_8u, 0, APPEND_16U(xyz_to_rgba), 0, 0, xyz_to_rgba_32f},
{xyz4_to_rgba_8u, 0, APPEND_16U(xyz4_to_rgba), 0, 0, xyz4_to_rgba_32f}
}
};
@ -711,6 +739,7 @@ namespace
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3 || src.channels() == 4);
CV_Assert(dcn == 3 || dcn == 4);
CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
@ -723,12 +752,12 @@ namespace
static const gpu_func_t funcs[2][2][6] =
{
{
{xyz_to_bgr_8u, 0, xyz_to_bgr_16u, 0, 0, xyz_to_bgr_32f},
{xyz4_to_bgr_8u, 0, xyz4_to_bgr_16u, 0, 0, xyz4_to_bgr_32f}
{xyz_to_bgr_8u, 0, APPEND_16U(xyz_to_bgr), 0, 0, xyz_to_bgr_32f},
{xyz4_to_bgr_8u, 0, APPEND_16U(xyz4_to_bgr), 0, 0, xyz4_to_bgr_32f}
},
{
{xyz_to_bgra_8u, 0, xyz_to_bgra_16u, 0, 0, xyz_to_bgra_32f},
{xyz4_to_bgra_8u, 0, xyz4_to_bgra_16u, 0, 0, xyz4_to_bgra_32f}
{xyz_to_bgra_8u, 0, APPEND_16U(xyz_to_bgra), 0, 0, xyz_to_bgra_32f},
{xyz4_to_bgra_8u, 0, APPEND_16U(xyz4_to_bgra), 0, 0, xyz4_to_bgra_32f}
}
};
@ -737,6 +766,7 @@ namespace
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
CV_Assert(src.channels() == 3 || src.channels() == 4);
CV_Assert(dcn == 3 || dcn == 4);
CV_Assert(funcs[dcn == 4][src.channels() == 4][src.depth()] != 0);
dst.create(src.size(), CV_MAKETYPE(src.depth(), dcn));
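
APPEND_16U is a token-pasting shim: in the full build APPEND_16U(bgr_to_rgb) expands to the existing bgr_to_rgb_16u launcher, while the tiny build substitutes a null entry, and the CV_Assert(funcs[...] != 0) lines added to every converter reject 16-bit inputs at run time instead of failing to link. A self-contained sketch of the same trick (the demo_* functions are placeholders, not the real converters):

#include <opencv2/core/core.hpp>

typedef void (*gpu_func_t)();

static void demo_8u()  {}
static void demo_16u() {}                    // only referenced by the full build
static void demo_32f() {}

#ifdef OPENCV_TINY_GPU_MODULE
    #define APPEND_16U(func) 0               // drop the 16-bit entry from the table
#else
    #define APPEND_16U(func) func ## _16u    // paste "_16u" onto the base name
#endif

// indexed by depth: CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F
static const gpu_func_t funcs[] = { demo_8u, 0, APPEND_16U(demo), 0, 0, demo_32f };

static void convertDemo(int depth)
{
    CV_Assert(funcs[depth] != 0);            // CV_16U now fails cleanly in the tiny build
    funcs[depth]();
}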

View File

@ -1168,12 +1168,14 @@ namespace cv { namespace gpu { namespace device
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
}
#ifndef OPENCV_TINY_GPU_MODULE
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
#endif
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
@ -1185,11 +1187,13 @@ namespace cv { namespace gpu { namespace device
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
}
#ifndef OPENCV_TINY_GPU_MODULE
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
#endif
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
@ -1203,10 +1207,12 @@ namespace cv { namespace gpu { namespace device
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
#endif
template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
@ -1218,12 +1224,14 @@ namespace cv { namespace gpu { namespace device
match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
}
#ifndef OPENCV_TINY_GPU_MODULE
template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
#endif
template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
@ -1235,12 +1243,14 @@ namespace cv { namespace gpu { namespace device
match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
}
#ifndef OPENCV_TINY_GPU_MODULE
//template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2L2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
#endif
template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
@ -1252,11 +1262,13 @@ namespace cv { namespace gpu { namespace device
match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
}
#ifndef OPENCV_TINY_GPU_MODULE
template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
//template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
template void match2Hamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
#endif
} // namespace bf_knnmatch
}}} // namespace cv { namespace gpu { namespace device {
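Every hunk in this file follows the same recipe: the templated knn-match kernels are left alone, and only the explicit instantiations for element types the tiny module no longer exposes are compiled out. A minimal stand-alone sketch of that technique, using an invented match_impl in place of the real matchL1_gpu/match2L2_gpu symbols:

#include <cstdio>

// Hypothetical stand-in for the templated matcher kernels.
template <typename T>
void match_impl(const T* query, const T* train, int n)
{
    long long acc = 0;
    for (int i = 0; i < n; ++i)
        acc += query[i] > train[i] ? (long long)(query[i] - train[i])
                                   : (long long)(train[i] - query[i]);
    std::printf("L1 distance = %lld\n", acc);
}

// Explicit instantiations: the tiny build keeps only the types its API still needs.
template void match_impl<unsigned char>(const unsigned char*, const unsigned char*, int);
#ifndef OPENCV_TINY_GPU_MODULE
template void match_impl<short>(const short*, const short*, int);
template void match_impl<int>(const int*, const int*, int);
#endif
template void match_impl<float>(const float*, const float*, int);

int main()
{
    const unsigned char q[] = {1, 2, 3}, t[] = {3, 2, 1};
    match_impl(q, t, 3);   // uses the uchar instantiation present in both builds
    return 0;
}

Compiling with -DOPENCV_TINY_GPU_MODULE drops the short/int instantiations while the template definition itself is untouched, so less device code gets generated.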
View File
@ -644,12 +644,14 @@ namespace cv { namespace gpu { namespace device
}
}
#ifndef OPENCV_TINY_GPU_MODULE
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
#endif
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
@ -669,11 +671,13 @@ namespace cv { namespace gpu { namespace device
}
}
#ifndef OPENCV_TINY_GPU_MODULE
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
#endif
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
@ -695,10 +699,12 @@ namespace cv { namespace gpu { namespace device
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
#endif
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
@ -718,12 +724,14 @@ namespace cv { namespace gpu { namespace device
}
}
#ifndef OPENCV_TINY_GPU_MODULE
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
#endif
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
@ -743,12 +751,14 @@ namespace cv { namespace gpu { namespace device
}
}
#ifndef OPENCV_TINY_GPU_MODULE
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
#endif
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
@ -768,11 +778,13 @@ namespace cv { namespace gpu { namespace device
}
}
#ifndef OPENCV_TINY_GPU_MODULE
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
#endif
} // namespace bf_match
}}} // namespace cv { namespace gpu { namespace device {
View File
@ -356,12 +356,14 @@ namespace cv { namespace gpu { namespace device
}
}
#ifndef OPENCV_TINY_GPU_MODULE
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
#endif
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
@ -381,11 +383,13 @@ namespace cv { namespace gpu { namespace device
}
}
#ifndef OPENCV_TINY_GPU_MODULE
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
#endif
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
@ -407,10 +411,12 @@ namespace cv { namespace gpu { namespace device
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
#endif
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
@ -421,12 +427,14 @@ namespace cv { namespace gpu { namespace device
stream);
}
#ifndef OPENCV_TINY_GPU_MODULE
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
#endif
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
@ -437,12 +445,14 @@ namespace cv { namespace gpu { namespace device
stream);
}
#ifndef OPENCV_TINY_GPU_MODULE
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
#endif
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
@ -453,11 +463,13 @@ namespace cv { namespace gpu { namespace device
stream);
}
#ifndef OPENCV_TINY_GPU_MODULE
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
#endif
} // namespace bf_radius_match
}}} // namespace cv { namespace gpu { namespace device
View File
@ -149,6 +149,16 @@ namespace cv { namespace gpu { namespace device
{
typedef void (*caller_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream);
#ifdef OPENCV_TINY_GPU_MODULE
static caller_t funcs[] =
{
bilateral_caller<T, BrdReflect101>,
bilateral_caller<T, BrdReplicate>,
0,
0,
0,
};
#else
static caller_t funcs[] =
{
bilateral_caller<T, BrdReflect101>,
@ -157,7 +167,13 @@ namespace cv { namespace gpu { namespace device
bilateral_caller<T, BrdReflect>,
bilateral_caller<T, BrdWrap>,
};
funcs[borderMode](src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream);
#endif
const caller_t caller = funcs[borderMode];
if (!caller)
cv::gpu::error("Unsupported input parameters for bilateral_filter", __FILE__, __LINE__, "");
caller(src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream);
}
}
}}}
@ -171,6 +187,7 @@ OCV_INSTANTIATE_BILATERAL_FILTER(uchar)
OCV_INSTANTIATE_BILATERAL_FILTER(uchar3)
OCV_INSTANTIATE_BILATERAL_FILTER(uchar4)
#ifndef OPENCV_TINY_GPU_MODULE
//OCV_INSTANTIATE_BILATERAL_FILTER(schar)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar2)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar3)
@ -190,6 +207,7 @@ OCV_INSTANTIATE_BILATERAL_FILTER(ushort4)
//OCV_INSTANTIATE_BILATERAL_FILTER(int2)
//OCV_INSTANTIATE_BILATERAL_FILTER(int3)
//OCV_INSTANTIATE_BILATERAL_FILTER(int4)
#endif
OCV_INSTANTIATE_BILATERAL_FILTER(float)
//OCV_INSTANTIATE_BILATERAL_FILTER(float2)
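The dispatcher hunk above replaces the unconditional funcs[borderMode](...) call with a table whose unsupported slots are null in the tiny build, plus a check that calls cv::gpu::error instead of jumping through a null pointer. A rough self-contained illustration of that shape (the border names, filter functions, and the std::runtime_error standing in for cv::gpu::error are all invented for the sketch):

#include <cstdio>
#include <stdexcept>

enum Border { REFLECT101, REPLICATE, CONSTANT, REFLECT, WRAP, BORDER_COUNT };

typedef void (*filter_caller_t)(const float* src, float* dst, int n);

void filter_reflect101(const float*, float*, int n) { std::printf("reflect101, n=%d\n", n); }
void filter_replicate (const float*, float*, int n) { std::printf("replicate,  n=%d\n", n); }
void filter_constant  (const float*, float*, int n) { std::printf("constant,   n=%d\n", n); }
void filter_reflect   (const float*, float*, int n) { std::printf("reflect,    n=%d\n", n); }
void filter_wrap      (const float*, float*, int n) { std::printf("wrap,       n=%d\n", n); }

void run_filter(Border mode, const float* src, float* dst, int n)
{
    static const filter_caller_t callers[BORDER_COUNT] =
    {
#ifdef OPENCV_TINY_GPU_MODULE
        filter_reflect101, filter_replicate, 0, 0, 0,   // modes compiled out become null
#else
        filter_reflect101, filter_replicate, filter_constant, filter_reflect, filter_wrap,
#endif
    };

    const filter_caller_t caller = callers[mode];
    if (!caller)
        throw std::runtime_error("border mode not supported in this build");
    caller(src, dst, n);
}

int main()
{
    float src[4] = {0, 1, 2, 3}, dst[4];
    run_filter(REPLICATE, src, dst, 4);   // available in both builds; WRAP would throw in a tiny build
    return 0;
}

Keeping the table the same size in both builds means the border-mode index stays valid everywhere; only the slots for dropped specializations turn into nulls.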
View File
@ -235,10 +235,16 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
#ifdef OPENCV_TINY_GPU_MODULE
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
#else
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
#endif
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
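Here the per-depth instantiation list is owned by the OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL macro, so a slimmer tiny-build definition of that one macro removes the 16U flavour from every colour conversion at once. A compact sketch of the same macro trick with made-up names (INSTANTIATE_ONE, INSTANTIATE_ALL, convert_*):

#include <cstdio>

// One "instantiation" per depth suffix; the suffix is stringized so the demo can print it.
#define INSTANTIATE_ONE(suffix) \
    void convert_##suffix() { std::printf("convert_%s\n", #suffix); }

#ifdef OPENCV_TINY_GPU_MODULE
// tiny build: only the 8-bit and 32-bit float flavours survive
#define INSTANTIATE_ALL(name) \
    INSTANTIATE_ONE(name##_8u) \
    INSTANTIATE_ONE(name##_32f)
#else
#define INSTANTIATE_ALL(name) \
    INSTANTIATE_ONE(name##_8u) \
    INSTANTIATE_ONE(name##_16u) \
    INSTANTIATE_ONE(name##_32f)
#endif

INSTANTIATE_ALL(bgr_to_gray)   // defines convert_bgr_to_gray_8u/_32f, plus _16u in full builds

int main()
{
    convert_bgr_to_gray_8u();   // present in both build flavours
    return 0;
}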
View File
@ -44,9 +44,13 @@
#include "column_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearColumn<float, unsigned short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */
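Each of these column_filter.*.cu files holds a single explicit instantiation of the shared template from column_filter.h, so the tiny build sheds whole translation units of generated kernels just by guarding that one line. A hedged single-file sketch of the layout, with linearColumn reduced to a host-side stand-in (the real one launches CUDA kernels):

#include <cstddef>
#include <cstdio>
#include <vector>

// ---- column_filter.h: shared template included by every per-type source file ----
namespace filter
{
    template <typename SrcT, typename DstT>
    void linearColumn(const std::vector<SrcT>& src, std::vector<DstT>& dst, DstT gain)
    {
        dst.resize(src.size());
        for (std::size_t i = 0; i < src.size(); ++i)
            dst[i] = static_cast<DstT>(src[i]) * gain;
    }
}

// ---- column_filter.8u_to_32f.cu: exactly one instantiation per translation unit ----
// The tiny build drops files like this one entirely through the guard below.
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
    template void linearColumn<unsigned char, float>(const std::vector<unsigned char>&,
                                                     std::vector<float>&, float);
}
#endif

int main()
{
    std::vector<unsigned char> src(4, 10);
    std::vector<float> dst;
    filter::linearColumn(src, dst, 0.5f);   // implicit instantiation keeps the demo working
    std::printf("dst[0] = %.1f\n", dst[0]);
    return 0;
}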
View File
@ -44,9 +44,13 @@
#include "column_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearColumn<float3, ushort3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */
View File
@ -44,9 +44,13 @@
#include "column_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearColumn<float4, ushort4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */
View File
@ -44,9 +44,13 @@
#include "column_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearColumn<float3, int3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */
View File
@ -44,9 +44,13 @@
#include "column_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearColumn<float4, int4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */
View File
@ -44,9 +44,13 @@
#include "column_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearColumn<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */
View File
@ -44,9 +44,13 @@
#include "column_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearColumn<float, int>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */
View File
@ -44,9 +44,13 @@
#include "column_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearColumn<float, short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */
View File
@ -44,9 +44,13 @@
#include "column_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearColumn<float4, short4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */
View File
@ -183,6 +183,186 @@ namespace filter
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
#ifdef OPENCV_TINY_GPU_MODULE
static const caller_t callers[5][33] =
{
{
0,
0,
0,
column_filter::caller< 3, T, D, BrdColReflect101>,
0,
column_filter::caller< 5, T, D, BrdColReflect101>,
0,
column_filter::caller< 7, T, D, BrdColReflect101>,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
},
{
0,
0,
0,
column_filter::caller< 3, T, D, BrdColReplicate>,
0,
column_filter::caller< 5, T, D, BrdColReplicate>,
0,
column_filter::caller< 7, T, D, BrdColReplicate>,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
},
{
0,
0,
0,
column_filter::caller< 3, T, D, BrdColConstant>,
0,
column_filter::caller< 5, T, D, BrdColConstant>,
0,
column_filter::caller< 7, T, D, BrdColConstant>,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
},
{
0,
0,
0,
column_filter::caller< 3, T, D, BrdColReflect>,
0,
column_filter::caller< 5, T, D, BrdColReflect>,
0,
column_filter::caller< 7, T, D, BrdColReflect>,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
},
{
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
}
};
#else
static const caller_t callers[5][33] =
{
{
@ -361,12 +541,17 @@ namespace filter
column_filter::caller<32, T, D, BrdColWrap>
}
};
#endif
const caller_t caller = callers[brd_type][ksize];
if (!caller)
cv::gpu::error("Unsupported input parameters for column_filter", __FILE__, __LINE__, "");
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
caller((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
}
}
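The dispatcher above indexes a [border][ksize] caller table whose tiny-build variant only fills kernel sizes 3, 5 and 7 for the first few border modes, rejects null slots with cv::gpu::error, and then uploads the filter taps to constant memory either synchronously or on the caller's stream. A stripped-down CUDA host-side sketch of that sequence, with invented symbols (c_kernel, run_ksize3/run_ksize5, the dummy kernel) and an exception in place of cv::gpu::error:

#include <cuda_runtime.h>
#include <cstdio>
#include <stdexcept>

__constant__ float c_kernel[32];   // stands in for column_filter::c_kernel

typedef void (*caller_t)(const float* src, float* dst, int n, cudaStream_t stream);

__global__ void dummy_filter(const float* src, float* dst, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) dst[i] = src[i] * c_kernel[0];
}

void run_ksize3(const float* src, float* dst, int n, cudaStream_t s)
{
    dummy_filter<<<(n + 255) / 256, 256, 0, s>>>(src, dst, n);
}

void run_ksize5(const float* src, float* dst, int n, cudaStream_t s)
{
    dummy_filter<<<(n + 255) / 256, 256, 0, s>>>(src, dst, n);
}

void column_filter(const float* src, float* dst, int n,
                   const float* h_kernel, int ksize, cudaStream_t stream)
{
    // Sparse table indexed by kernel size; a tiny build leaves most slots null.
    static const caller_t callers[8] = { 0, 0, 0, run_ksize3, 0, run_ksize5, 0, 0 };

    const caller_t caller = callers[ksize];
    if (!caller)
        throw std::runtime_error("kernel size not supported in this build");

    // Upload the filter taps to constant memory, honouring the caller's stream.
    if (stream == 0)
        cudaMemcpyToSymbol(c_kernel, h_kernel, ksize * sizeof(float));
    else
        cudaMemcpyToSymbolAsync(c_kernel, h_kernel, ksize * sizeof(float), 0,
                                cudaMemcpyHostToDevice, stream);

    caller(src, dst, n, stream);
}

int main()
{
    const int n = 256;
    const float taps[3] = { 0.25f, 0.5f, 0.25f };
    float *d_src = 0, *d_dst = 0;
    cudaMalloc(&d_src, n * sizeof(float));
    cudaMalloc(&d_dst, n * sizeof(float));
    column_filter(d_src, d_dst, n, taps, 3, 0);
    cudaDeviceSynchronize();
    std::printf("status: %s\n", cudaGetErrorString(cudaGetLastError()));
    cudaFree(d_src);
    cudaFree(d_dst);
    return 0;
}

The real code copies the kernel device-to-device because the taps already live in GPU memory; the sketch uploads from the host for brevity.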
View File
@ -90,10 +90,18 @@ namespace cv { namespace gpu { namespace device
CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
#ifdef OPENCV_TINY_GPU_MODULE
0,
#else
CopyMakeBorderDispatcher<BrdWrap, vec_type>::call
#endif
};
callers[borderMode](PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
const caller_t caller = callers[borderMode];
if (!caller)
cv::gpu::error("Unsupported input parameters for copyMakeBorder", __FILE__, __LINE__, "");
caller(PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
}
template void copyMakeBorder_gpu<uchar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
@ -101,6 +109,7 @@ namespace cv { namespace gpu { namespace device
template void copyMakeBorder_gpu<uchar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
//template void copyMakeBorder_gpu<schar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
@ -120,6 +129,7 @@ namespace cv { namespace gpu { namespace device
//template void copyMakeBorder_gpu<int, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
#endif
template void copyMakeBorder_gpu<float, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<float, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
View File
@ -234,6 +234,7 @@ namespace arithm
}
template void addMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void addMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -278,7 +279,9 @@ namespace arithm
//template void addMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
#endif
template void addMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void addMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -288,6 +291,7 @@ namespace arithm
//template void addMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
#endif
}
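Judging by the guards above, the element-operation policy is that essentially only the 8U and 32F depths remain compiled in the tiny build (the visible survivors for addMat are the uchar-to-uchar and float-to-float pairs), so other depth combinations have to be refused before dispatch rather than discovered as missing symbols. A small hedged sketch of such a host-side guard (the Depth codes and addMatCaller are invented, not the real OpenCV dispatch code):

#include <cstdio>
#include <stdexcept>

enum Depth { D8U, D8S, D16U, D16S, D32S, D32F, D64F };

bool depthSupported(Depth d)
{
#ifdef OPENCV_TINY_GPU_MODULE
    return d == D8U || d == D32F;   // only these depths keep their instantiations
#else
    (void)d;
    return true;                    // the full build instantiates every depth
#endif
}

// Stand-in for the host wrapper that would select an addMat<SrcT, DstT> instantiation.
void addMatCaller(Depth srcDepth, Depth dstDepth)
{
    if (!depthSupported(srcDepth) || !depthSupported(dstDepth))
        throw std::runtime_error("depth combination not compiled into this build");
    std::printf("dispatching addMat for depths %d -> %d\n", (int)srcDepth, (int)dstDepth);
}

int main()
{
    addMatCaller(D8U, D8U);                     // kept in both builds
    try { addMatCaller(D16U, D16U); }           // rejected when OPENCV_TINY_GPU_MODULE is set
    catch (const std::exception& e) { std::printf("rejected: %s\n", e.what()); }
    return 0;
}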
//////////////////////////////////////////////////////////////////////////
@ -329,6 +333,7 @@ namespace arithm
}
template void addScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void addScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -373,7 +378,9 @@ namespace arithm
//template void addScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
#endif
template void addScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void addScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -383,6 +390,7 @@ namespace arithm
//template void addScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void addScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void addScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -469,6 +477,7 @@ namespace arithm
}
template void subMat<uchar, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void subMat<uchar, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<uchar, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -513,7 +522,9 @@ namespace arithm
//template void subMat<float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
#endif
template void subMat<float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void subMat<float, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -523,6 +534,7 @@ namespace arithm
//template void subMat<double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subMat<double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subMat<double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -542,6 +554,7 @@ namespace arithm
}
template void subScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void subScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -586,7 +599,9 @@ namespace arithm
//template void subScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
#endif
template void subScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void subScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
@ -596,6 +611,7 @@ namespace arithm
//template void subScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
//template void subScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
template void subScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, PtrStepb mask, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -700,6 +716,7 @@ namespace arithm
}
template void mulMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void mulMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
@ -744,7 +761,9 @@ namespace arithm
//template void mulMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
#endif
template void mulMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void mulMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
@ -754,6 +773,7 @@ namespace arithm
//template void mulMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void mulMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void mulMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -791,6 +811,7 @@ namespace arithm
}
template void mulScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void mulScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@ -835,7 +856,9 @@ namespace arithm
//template void mulScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
#endif
template void mulScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void mulScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@ -845,6 +868,7 @@ namespace arithm
//template void mulScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void mulScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void mulScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -968,6 +992,7 @@ namespace arithm
}
template void divMat<uchar, float, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void divMat<uchar, float, schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<uchar, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
@ -1012,7 +1037,9 @@ namespace arithm
//template void divMat<float, float, ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<float, float, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
#endif
template void divMat<float, float, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void divMat<float, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
@ -1022,6 +1049,7 @@ namespace arithm
//template void divMat<double, double, int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
//template void divMat<double, double, float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
template void divMat<double, double, double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, double scale, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -1037,6 +1065,7 @@ namespace arithm
}
template void divScalar<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void divScalar<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@ -1081,7 +1110,9 @@ namespace arithm
//template void divScalar<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
#endif
template void divScalar<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void divScalar<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@ -1091,6 +1122,7 @@ namespace arithm
//template void divScalar<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divScalar<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divScalar<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -1128,6 +1160,7 @@ namespace arithm
}
template void divInv<uchar, float, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void divInv<uchar, float, schar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<uchar, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<uchar, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@ -1172,7 +1205,9 @@ namespace arithm
//template void divInv<float, float, ushort>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<float, float, short>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<float, float, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
#endif
template void divInv<float, float, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void divInv<float, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<double, double, uchar>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
@ -1182,6 +1217,7 @@ namespace arithm
//template void divInv<double, double, int>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
//template void divInv<double, double, float>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
template void divInv<double, double, double>(PtrStepSzb src1, double val, PtrStepSzb dst, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -1278,12 +1314,16 @@ namespace arithm
}
template void absDiffMat<uchar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void absDiffMat<schar>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<short>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffMat<int>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#endif
template void absDiffMat<float>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void absDiffMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -1323,12 +1363,16 @@ namespace arithm
}
template void absDiffScalar<uchar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void absDiffScalar<schar, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<ushort, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<short, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void absDiffScalar<int, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
#endif
template void absDiffScalar<float, float>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void absDiffScalar<double, double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -1349,13 +1393,17 @@ namespace arithm
transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, abs_func<T>(), WithOutMask(), stream);
}
#ifndef OPENCV_TINY_GPU_MODULE
template void absMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void absMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#endif
template void absMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void absMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -1390,13 +1438,17 @@ namespace arithm
transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Sqr<T>(), WithOutMask(), stream);
}
#ifndef OPENCV_TINY_GPU_MODULE
template void sqrMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#endif
template void sqrMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void sqrMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -1417,13 +1469,17 @@ namespace arithm
transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, sqrt_func<T>(), WithOutMask(), stream);
}
#ifndef OPENCV_TINY_GPU_MODULE
template void sqrtMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void sqrtMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#endif
template void sqrtMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void sqrtMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -1444,13 +1500,17 @@ namespace arithm
transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, log_func<T>(), WithOutMask(), stream);
}
#ifndef OPENCV_TINY_GPU_MODULE
template void logMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void logMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#endif
template void logMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void logMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -1486,13 +1546,17 @@ namespace arithm
transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, Exp<T>(), WithOutMask(), stream);
}
#ifndef OPENCV_TINY_GPU_MODULE
template void expMat<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void expMat<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#endif
template void expMat<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void expMat<double>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////////////////
@ -1620,36 +1684,52 @@ namespace arithm
}
template void cmpMatEq<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpMatEq<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatEq<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpMatEq<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpMatEq<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpMatNe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpMatNe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatNe<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpMatNe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpMatNe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpMatLt<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpMatLt<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLt<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpMatLt<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpMatLt<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpMatLe<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpMatLe<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void cmpMatLe<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpMatLe<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpMatLe<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////////////////
@ -1824,52 +1904,76 @@ namespace arithm
}
template void cmpScalarEq<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpScalarEq<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarEq<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpScalarEq<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpScalarEq<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpScalarNe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpScalarNe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarNe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpScalarNe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpScalarNe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpScalarLt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpScalarLt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpScalarLt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpScalarLt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpScalarLe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpScalarLe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarLe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpScalarLe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpScalarLe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpScalarGt<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpScalarGt<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGt<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpScalarGt<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpScalarGt<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpScalarGe<uchar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpScalarGe<schar >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<ushort>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<short >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
template void cmpScalarGe<int >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#endif
template void cmpScalarGe<float >(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void cmpScalarGe<double>(PtrStepSzb src, int cn, double val[4], PtrStepSzb dst, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////////////////
@ -1981,19 +2085,25 @@ namespace arithm
}
template void bitScalarAnd<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void bitScalarAnd<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarAnd<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
#endif
template void bitScalarOr<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void bitScalarOr<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarOr<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
#endif
template void bitScalarXor<uchar>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void bitScalarXor<ushort>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
template void bitScalarXor<unsigned int>(PtrStepSzb src1, uint src2, PtrStepSzb dst, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -2067,18 +2177,27 @@ namespace arithm
}
template void minMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void minMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void minMat<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#endif
template void minMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void minMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#endif
template <typename T> void minScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
{
transform((PtrStepSz<T>) src1, (PtrStepSz<T>) dst, cv::gpu::device::bind2nd(minimum<T>(), src2), WithOutMask(), stream);
}
#ifdef OPENCV_TINY_GPU_MODULE
template void minScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
#else
template void minScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
@ -2086,6 +2205,7 @@ namespace arithm
template void minScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void minScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -2159,12 +2279,16 @@ namespace arithm
}
template void maxMat<uchar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void maxMat<schar >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<ushort>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<short >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
template void maxMat<int >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#endif
template void maxMat<float >(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void maxMat<double>(PtrStepSzb src1, PtrStepSzb src2, PtrStepSzb dst, cudaStream_t stream);
#endif
template <typename T> void maxScalar(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream)
{
@ -2172,12 +2296,16 @@ namespace arithm
}
template void maxScalar<uchar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void maxScalar<schar >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<ushort>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<short >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
template void maxScalar<int >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
#endif
template void maxScalar<float >(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void maxScalar<double>(PtrStepSzb src1, double src2, PtrStepSzb dst, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -2233,12 +2361,16 @@ namespace arithm
}
template void threshold<uchar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void threshold<schar>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<ushort>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<short>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
template void threshold<int>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
#endif
template void threshold<float>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void threshold<double>(PtrStepSzb src, PtrStepSzb dst, double thresh, double maxVal, int type, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -2312,13 +2444,17 @@ namespace arithm
transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, PowOp<T>(power), WithOutMask(), stream);
}
#ifndef OPENCV_TINY_GPU_MODULE
template void pow<uchar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<schar>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<short>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<ushort>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
template void pow<int>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
#endif
template void pow<float>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void pow<double>(PtrStepSzb src, double power, PtrStepSzb dst, cudaStream_t stream);
#endif
}
//////////////////////////////////////////////////////////////////////////
@ -2397,6 +2533,7 @@ namespace arithm
}
template void addWeighted<uchar, uchar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void addWeighted<uchar, uchar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, uchar, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@ -2451,9 +2588,10 @@ namespace arithm
template void addWeighted<uchar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<uchar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
#endif
#ifndef OPENCV_TINY_GPU_MODULE
template void addWeighted<schar, schar, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, schar, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@ -2501,9 +2639,10 @@ namespace arithm
template void addWeighted<schar, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<schar, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
#endif
#ifndef OPENCV_TINY_GPU_MODULE
template void addWeighted<ushort, ushort, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, ushort, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@ -2543,9 +2682,10 @@ namespace arithm
template void addWeighted<ushort, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<ushort, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
#endif
#ifndef OPENCV_TINY_GPU_MODULE
template void addWeighted<short, short, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, short, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@ -2577,9 +2717,10 @@ namespace arithm
template void addWeighted<short, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<short, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
#endif
#ifndef OPENCV_TINY_GPU_MODULE
template void addWeighted<int, int, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, int, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@ -2603,15 +2744,18 @@ namespace arithm
template void addWeighted<int, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<int, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
#endif
#ifndef OPENCV_TINY_GPU_MODULE
template void addWeighted<float, float, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, short>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, float, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
#endif
template void addWeighted<float, float, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void addWeighted<float, float, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@ -2621,9 +2765,11 @@ namespace arithm
template void addWeighted<float, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<float, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
#endif
#ifndef OPENCV_TINY_GPU_MODULE
template void addWeighted<double, double, uchar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, schar>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, ushort>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
@ -2631,6 +2777,7 @@ namespace arithm
template void addWeighted<double, double, int>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, float>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
template void addWeighted<double, double, double>(PtrStepSzb src1, double alpha, PtrStepSzb src2, double beta, double gamma, PtrStepSzb dst, cudaStream_t stream);
#endif
}
#endif /* CUDA_DISABLER */
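The guards above all follow one pattern: the uchar and float instantiations are always compiled, while every other depth sits behind #ifndef OPENCV_TINY_GPU_MODULE. A minimal standalone sketch of that conditional-instantiation idea, with a hypothetical host-side scaleMat standing in for the real CUDA transform wrappers (not part of the diff):
#include <cstddef>
// Hypothetical stand-in for the device-side wrappers instantiated above.
template <typename T>
void scaleMat(const T* src, T* dst, std::size_t n, T factor)
{
    for (std::size_t i = 0; i < n; ++i)
        dst[i] = static_cast<T>(src[i] * factor);
}
// Explicit instantiations: the tiny build keeps only the depths it supports.
template void scaleMat<unsigned char>(const unsigned char*, unsigned char*, std::size_t, unsigned char);
#ifndef OPENCV_TINY_GPU_MODULE
template void scaleMat<signed char>(const signed char*, signed char*, std::size_t, signed char);
template void scaleMat<short>(const short*, short*, std::size_t, short);
template void scaleMat<int>(const int*, int*, std::size_t, int);
template void scaleMat<double>(const double*, double*, std::size_t, double);
#endif
template void scaleMat<float>(const float*, float*, std::size_t, float);
Every instantiation that is compiled out also drops the corresponding kernel from the binary, which is where the size saving of the tiny module comes from.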

View File

@ -985,6 +985,16 @@ namespace cv { namespace gpu { namespace device
int borderMode, const float* borderValue, cudaStream_t stream)
{
typedef void (*func_t)(const PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<D> dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* borderValue, cudaStream_t stream);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[] =
{
Filter2DCaller<T, D, BrdReflect101>::call,
Filter2DCaller<T, D, BrdReplicate>::call,
Filter2DCaller<T, D, BrdConstant>::call,
Filter2DCaller<T, D, BrdReflect>::call,
0
};
#else
static const func_t funcs[] =
{
Filter2DCaller<T, D, BrdReflect101>::call,
@ -993,19 +1003,26 @@ namespace cv { namespace gpu { namespace device
Filter2DCaller<T, D, BrdReflect>::call,
Filter2DCaller<T, D, BrdWrap>::call
};
#endif
const func_t func = funcs[borderMode];
if (!func)
cv::gpu::error("Unsupported input parameters for filter2D", __FILE__, __LINE__, "");
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(c_filter2DKernel, kernel, kWidth * kHeight * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
funcs[borderMode](static_cast< PtrStepSz<T> >(srcWhole), ofsX, ofsY, static_cast< PtrStepSz<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
func(static_cast< PtrStepSz<T> >(srcWhole), ofsX, ofsY, static_cast< PtrStepSz<D> >(dst), kWidth, kHeight, anchorX, anchorY, borderValue, stream);
}
template void filter2D_gpu<uchar, uchar>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
template void filter2D_gpu<uchar4, uchar4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void filter2D_gpu<ushort, ushort>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
template void filter2D_gpu<ushort4, ushort4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
#endif
template void filter2D_gpu<float, float>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
template void filter2D_gpu<float4, float4>(PtrStepSzb srcWhole, int ofsX, int ofsY, PtrStepSzb dst, int kWidth, int kHeight, int anchorX, int anchorY, const float* kernel, int borderMode, const float* borderValue, cudaStream_t stream);
} // namespace imgproc
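The filter2D dispatcher above shows the second recurring pattern: the tiny build keeps the funcs table the same shape but replaces the BrdWrap entry with 0, and the added null check turns an unsupported border mode into an error instead of a call through a null pointer. A self-contained sketch of that null-guarded dispatch, with hypothetical border handlers in place of the real Filter2DCaller instantiations:
#include <cstdio>
#include <stdexcept>
typedef void (*border_func_t)(int pixel);
static void reflect101(int p) { std::printf("reflect101: %d\n", p); }
static void replicate (int p) { std::printf("replicate: %d\n", p); }
static void constant  (int p) { std::printf("constant: %d\n", p); }
static void reflect   (int p) { std::printf("reflect: %d\n", p); }
static void wrap      (int p) { std::printf("wrap: %d\n", p); }
void filterDispatch(int borderMode, int pixel)
{
    static const border_func_t funcs[] =
    {
        reflect101,
        replicate,
        constant,
        reflect,
#ifdef OPENCV_TINY_GPU_MODULE
        0        // BrdWrap is not compiled into the tiny module
#else
        wrap
#endif
    };
    const border_func_t func = funcs[borderMode];
    if (!func)   // mirrors the cv::gpu::error("Unsupported ...") check above
        throw std::runtime_error("Unsupported border mode");
    func(pixel);
}
With OPENCV_TINY_GPU_MODULE defined, filterDispatch(4, x) reports the unsupported mode up front, which is exactly what the added func check buys.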

View File

@ -462,6 +462,7 @@ namespace sum
}
template void run<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#ifndef OPENCV_TINY_GPU_MODULE
template void run<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@ -485,8 +486,10 @@ namespace sum
template void run<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#endif
template void run<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#ifndef OPENCV_TINY_GPU_MODULE
template void run<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@ -495,6 +498,7 @@ namespace sum
template void run<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void run<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#endif
template <typename T, int cn>
void runAbs(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask)
@ -504,6 +508,7 @@ namespace sum
}
template void runAbs<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#ifndef OPENCV_TINY_GPU_MODULE
template void runAbs<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@ -527,8 +532,10 @@ namespace sum
template void runAbs<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#endif
template void runAbs<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#ifndef OPENCV_TINY_GPU_MODULE
template void runAbs<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@ -537,6 +544,7 @@ namespace sum
template void runAbs<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runAbs<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#endif
template <typename T> struct Sqr : unary_function<T, T>
{
@ -553,6 +561,7 @@ namespace sum
}
template void runSqr<uchar, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#ifndef OPENCV_TINY_GPU_MODULE
template void runSqr<uchar, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<uchar, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@ -576,8 +585,10 @@ namespace sum
template void runSqr<int, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<int, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#endif
template void runSqr<float, 1>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#ifndef OPENCV_TINY_GPU_MODULE
template void runSqr<float, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<float, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
@ -586,6 +597,7 @@ namespace sum
template void runSqr<double, 2>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 3>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
template void runSqr<double, 4>(PtrStepSzb src, void* buf, double* out, PtrStepSzb mask);
#endif
}
/////////////////////////////////////////////////////////////
@ -773,12 +785,16 @@ namespace minMax
}
template void run<uchar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
#ifndef OPENCV_TINY_GPU_MODULE
template void run<schar >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<ushort>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
#endif
template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
#ifndef OPENCV_TINY_GPU_MODULE
template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
#endif
}
/////////////////////////////////////////////////////////////
@ -955,12 +971,16 @@ namespace minMaxLoc
}
template void run<unsigned char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
#ifndef OPENCV_TINY_GPU_MODULE
template void run<signed char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<unsigned short>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
#endif
template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
#ifndef OPENCV_TINY_GPU_MODULE
template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
#endif
}
/////////////////////////////////////////////////////////////
@ -1079,12 +1099,16 @@ namespace countNonZero
}
template int run<uchar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
#ifndef OPENCV_TINY_GPU_MODULE
template int run<schar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<ushort>(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<short >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<int >(const PtrStepSzb src, PtrStep<unsigned int> buf);
#endif
template int run<float >(const PtrStepSzb src, PtrStep<unsigned int> buf);
#ifndef OPENCV_TINY_GPU_MODULE
template int run<double>(const PtrStepSzb src, PtrStep<unsigned int> buf);
#endif
}
//////////////////////////////////////////////////////////////////////////////
@ -1257,6 +1281,11 @@ namespace reduce
funcs[op]((PtrStepSz<T>) src, (D*) dst, stream);
}
#ifdef OPENCV_TINY_GPU_MODULE
template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<float, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
#else
template void rows<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, int, int>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<unsigned char, float, float>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
@ -1280,6 +1309,7 @@ namespace reduce
template void rows<float, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
template void rows<double, double, double>(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
#endif
///////////////////////////////////////////////////////////
@ -1338,6 +1368,11 @@ namespace reduce
funcs[cn][op](src, dst, stream);
}
#ifdef OPENCV_TINY_GPU_MODULE
template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<float, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
#else
template void cols<unsigned char, int, unsigned char>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, int, int>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<unsigned char, float, float>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
@ -1361,6 +1396,7 @@ namespace reduce
template void cols<float, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
template void cols<double, double, double>(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
#endif
}
#endif /* CUDA_DISABLER */

View File

@ -197,6 +197,7 @@ namespace cv { namespace gpu { namespace device
template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
//template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
@ -216,6 +217,7 @@ namespace cv { namespace gpu { namespace device
//template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#endif
template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

View File

@ -166,6 +166,7 @@ namespace cv { namespace gpu { namespace device
template void pyrUp_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
//template void pyrUp_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
@ -185,6 +186,7 @@ namespace cv { namespace gpu { namespace device
//template void pyrUp_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#endif
template void pyrUp_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

View File

@ -209,6 +209,7 @@ namespace cv { namespace gpu { namespace device
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
#ifdef OPENCV_TINY_GPU_MODULE
static const caller_t callers[3][5] =
{
{
@ -216,25 +217,55 @@ namespace cv { namespace gpu { namespace device
RemapDispatcher<PointFilter, BrdReplicate, T>::call,
RemapDispatcher<PointFilter, BrdConstant, T>::call,
RemapDispatcher<PointFilter, BrdReflect, T>::call,
RemapDispatcher<PointFilter, BrdWrap, T>::call
0/*RemapDispatcher<PointFilter, BrdWrap, T>::call*/,
},
{
RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
RemapDispatcher<LinearFilter, BrdConstant, T>::call,
RemapDispatcher<LinearFilter, BrdReflect, T>::call,
RemapDispatcher<LinearFilter, BrdWrap, T>::call
0/*RemapDispatcher<LinearFilter, BrdWrap, T>::call*/,
},
{
0/*RemapDispatcher<CubicFilter, BrdReflect101, T>::call*/,
0/*RemapDispatcher<CubicFilter, BrdReplicate, T>::call*/,
0/*RemapDispatcher<CubicFilter, BrdConstant, T>::call*/,
0/*RemapDispatcher<CubicFilter, BrdReflect, T>::call*/,
0/*RemapDispatcher<CubicFilter, BrdWrap, T>::call*/,
}
};
#else
static const caller_t callers[3][5] =
{
{
RemapDispatcher<PointFilter, BrdReflect101, T>::call,
RemapDispatcher<PointFilter, BrdReplicate, T>::call,
RemapDispatcher<PointFilter, BrdConstant, T>::call,
RemapDispatcher<PointFilter, BrdReflect, T>::call,
RemapDispatcher<PointFilter, BrdWrap, T>::call,
},
{
RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
RemapDispatcher<LinearFilter, BrdConstant, T>::call,
RemapDispatcher<LinearFilter, BrdReflect, T>::call,
RemapDispatcher<LinearFilter, BrdWrap, T>::call,
},
{
RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
RemapDispatcher<CubicFilter, BrdConstant, T>::call,
RemapDispatcher<CubicFilter, BrdReflect, T>::call,
RemapDispatcher<CubicFilter, BrdWrap, T>::call
RemapDispatcher<CubicFilter, BrdWrap, T>::call,
}
};
#endif
callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
const caller_t caller = callers[interpolation][borderMode];
if (!caller)
cv::gpu::error("Unsupported input parameters for remap", __FILE__, __LINE__, "");
caller(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
}
@ -243,6 +274,7 @@ namespace cv { namespace gpu { namespace device
template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
#ifndef OPENCV_TINY_GPU_MODULE
//template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@ -262,6 +294,7 @@ namespace cv { namespace gpu { namespace device
//template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
#endif
template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

View File

@ -342,11 +342,13 @@ namespace cv { namespace gpu { namespace device
template <> struct ResizeNearestDispatcher<uchar> : SelectImplForNearest<uchar> {};
template <> struct ResizeNearestDispatcher<uchar4> : SelectImplForNearest<uchar4> {};
#ifndef OPENCV_TINY_GPU_MODULE
template <> struct ResizeNearestDispatcher<ushort> : SelectImplForNearest<ushort> {};
template <> struct ResizeNearestDispatcher<ushort4> : SelectImplForNearest<ushort4> {};
template <> struct ResizeNearestDispatcher<short> : SelectImplForNearest<short> {};
template <> struct ResizeNearestDispatcher<short4> : SelectImplForNearest<short4> {};
#endif
template <> struct ResizeNearestDispatcher<float> : SelectImplForNearest<float> {};
template <> struct ResizeNearestDispatcher<float4> : SelectImplForNearest<float4> {};
@ -380,11 +382,13 @@ namespace cv { namespace gpu { namespace device
template <> struct ResizeLinearDispatcher<uchar> : SelectImplForLinear<uchar> {};
template <> struct ResizeLinearDispatcher<uchar4> : SelectImplForLinear<uchar4> {};
#ifndef OPENCV_TINY_GPU_MODULE
template <> struct ResizeLinearDispatcher<ushort> : SelectImplForLinear<ushort> {};
template <> struct ResizeLinearDispatcher<ushort4> : SelectImplForLinear<ushort4> {};
template <> struct ResizeLinearDispatcher<short> : SelectImplForLinear<short> {};
template <> struct ResizeLinearDispatcher<short4> : SelectImplForLinear<short4> {};
#endif
template <> struct ResizeLinearDispatcher<float> : SelectImplForLinear<float> {};
template <> struct ResizeLinearDispatcher<float4> : SelectImplForLinear<float4> {};
@ -410,6 +414,7 @@ namespace cv { namespace gpu { namespace device
}
};
#ifndef OPENCV_TINY_GPU_MODULE
template <> struct ResizeCubicDispatcher<uchar> : SelectImplForCubic<uchar> {};
template <> struct ResizeCubicDispatcher<uchar4> : SelectImplForCubic<uchar4> {};
@ -421,6 +426,7 @@ namespace cv { namespace gpu { namespace device
template <> struct ResizeCubicDispatcher<float> : SelectImplForCubic<float> {};
template <> struct ResizeCubicDispatcher<float4> : SelectImplForCubic<float4> {};
#endif
// ResizeAreaDispatcher
@ -467,7 +473,11 @@ namespace cv { namespace gpu { namespace device
{
ResizeNearestDispatcher<T>::call,
ResizeLinearDispatcher<T>::call,
#ifdef OPENCV_TINY_GPU_MODULE
0,
#else
ResizeCubicDispatcher<T>::call,
#endif
ResizeAreaDispatcher<T>::call
};
@ -475,13 +485,18 @@ namespace cv { namespace gpu { namespace device
if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
interpolation = 1;
funcs[interpolation](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), yoff, xoff, static_cast< PtrStepSz<T> >(dst), fy, fx, stream);
const func_t func = funcs[interpolation];
if (!func)
cv::gpu::error("Unsupported input parameters for resize", __FILE__, __LINE__, "");
func(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), yoff, xoff, static_cast< PtrStepSz<T> >(dst), fy, fx, stream);
}
template void resize<uchar >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
template void resize<uchar3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
template void resize<uchar4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
#ifndef OPENCV_TINY_GPU_MODULE
template void resize<ushort >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
template void resize<ushort3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
template void resize<ushort4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
@ -489,6 +504,7 @@ namespace cv { namespace gpu { namespace device
template void resize<short >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
template void resize<short3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
template void resize<short4>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
#endif
template void resize<float >(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
template void resize<float3>(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
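resize combines both patterns with a runtime downgrade: the cubic entry becomes 0 in the tiny table, area interpolation already falls back to linear when fx or fy is at most 1, and the new func check rejects whatever remains unsupported. A hypothetical sketch of that selection logic (made-up function names; enum values follow the usual 0..3 interpolation codes):
#include <cstdio>
#include <stdexcept>
enum { INTER_NEAREST = 0, INTER_LINEAR = 1, INTER_CUBIC = 2, INTER_AREA = 3 };
typedef void (*resize_func_t)(float fx, float fy);
static void resizeNearest(float, float) { std::puts("nearest"); }
static void resizeLinear (float, float) { std::puts("linear");  }
static void resizeCubic  (float, float) { std::puts("cubic");   }
static void resizeArea   (float, float) { std::puts("area");    }
void resizeDispatch(int interpolation, float fx, float fy)
{
    static const resize_func_t funcs[] =
    {
        resizeNearest,
        resizeLinear,
#ifdef OPENCV_TINY_GPU_MODULE
        0,                 // cubic interpolation dropped from the tiny build
#else
        resizeCubic,
#endif
        resizeArea
    };
    // Same fallback as in the diff above: for fx or fy <= 1 the area path
    // reuses the linear kernel.
    if (interpolation == INTER_AREA && (fx <= 1.f || fy <= 1.f))
        interpolation = INTER_LINEAR;
    const resize_func_t func = funcs[interpolation];
    if (!func)
        throw std::runtime_error("Unsupported input parameters for resize");
    func(fx, fy);
}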

View File

@ -44,9 +44,13 @@
#include "row_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearRow<unsigned short, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */

View File

@ -44,9 +44,13 @@
#include "row_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearRow<ushort3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */

View File

@ -44,9 +44,13 @@
#include "row_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearRow<ushort4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */

View File

@ -44,9 +44,13 @@
#include "row_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearRow<int3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */

View File

@ -44,9 +44,13 @@
#include "row_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearRow<int4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */

View File

@ -44,9 +44,13 @@
#include "row_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearRow<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */

View File

@ -44,9 +44,13 @@
#include "row_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearRow<int, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */

View File

@ -44,9 +44,13 @@
#include "row_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearRow<short, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */

View File

@ -44,9 +44,13 @@
#include "row_filter.h"
#ifndef OPENCV_TINY_GPU_MODULE
namespace filter
{
template void linearRow<short4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif
#endif /* CUDA_DISABLER */

View File

@ -182,6 +182,186 @@ namespace filter
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
#ifdef OPENCV_TINY_GPU_MODULE
static const caller_t callers[5][33] =
{
{
0,
0,
0,
row_filter::caller< 3, T, D, BrdRowReflect101>,
0,
row_filter::caller< 5, T, D, BrdRowReflect101>,
0,
row_filter::caller< 7, T, D, BrdRowReflect101>,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
},
{
0,
0,
0,
row_filter::caller< 3, T, D, BrdRowReplicate>,
0,
row_filter::caller< 5, T, D, BrdRowReplicate>,
0,
row_filter::caller< 7, T, D, BrdRowReplicate>,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
},
{
0,
0,
0,
row_filter::caller< 3, T, D, BrdRowConstant>,
0,
row_filter::caller< 5, T, D, BrdRowConstant>,
0,
row_filter::caller< 7, T, D, BrdRowConstant>,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
},
{
0,
0,
0,
row_filter::caller< 3, T, D, BrdRowReflect>,
0,
row_filter::caller< 5, T, D, BrdRowReflect>,
0,
row_filter::caller< 7, T, D, BrdRowReflect>,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
},
{
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
}
};
#else
static const caller_t callers[5][33] =
{
{
@ -360,12 +540,17 @@ namespace filter
row_filter::caller<32, T, D, BrdRowWrap>
}
};
#endif
const caller_t caller = callers[brd_type][ksize];
if (!caller)
cv::gpu::error("Unsupported input parameters for row_filter", __FILE__, __LINE__, "");
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
caller((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
}
}
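The row-filter table is the most aggressive cut: of the 5 border types by 33 kernel sizes, the tiny build keeps non-null callers only for border types 0..3 (BrdRowWrap is dropped) and kernel sizes 3, 5 and 7; every other slot is 0 and now trips the added error check. A hypothetical predicate capturing which (brd_type, ksize) pairs the tiny table above leaves callable:
#include <cstdio>
#include <stdexcept>
// Mirrors the sparse tiny-module table: only border types 0..3 and
// kernel sizes 3, 5, 7 have a non-null row_filter caller.
static bool tinyRowFilterSupported(int brd_type, int ksize)
{
    const bool brdOk   = brd_type >= 0 && brd_type <= 3;
    const bool ksizeOk = ksize == 3 || ksize == 5 || ksize == 7;
    return brdOk && ksizeOk;
}
void checkRowFilter(int brd_type, int ksize)
{
    if (!tinyRowFilterSupported(brd_type, ksize))
        throw std::runtime_error("Unsupported input parameters for row_filter");
    std::printf("row filter ok: brd_type=%d ksize=%d\n", brd_type, ksize);
}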

View File

@ -330,24 +330,55 @@ namespace cv { namespace gpu { namespace device
typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream);
#ifdef OPENCV_TINY_GPU_MODULE
const static kernel_caller_t callers[] =
{
0,
kernel_caller< 1>,
kernel_caller< 2>,
kernel_caller< 3>,
kernel_caller< 4>,
kernel_caller< 5>,
0/*kernel_caller< 6>*/,
0/*kernel_caller< 7>*/,
0/*kernel_caller< 8>*/,
kernel_caller< 9>,
0/*kernel_caller<10>*/,
0/*kernel_caller<11>*/,
0/*kernel_caller<12>*/,
0/*kernel_caller<13>*/,
0/*kernel_caller<14>*/,
kernel_caller<15>,
0/*kernel_caller<16>*/,
0/*kernel_caller<17>*/,
0/*kernel_caller<18>*/,
0/*kernel_caller<19>*/,
0/*kernel_caller<20>*/,
0/*kernel_caller<21>*/,
0/*kernel_caller<22>*/,
0/*kernel_caller<23>*/,
0/*kernel_caller<24>*/,
0/*kernel_caller<25>*/,
};
#else
const static kernel_caller_t callers[] =
{
0,
kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<15>, kernel_caller<15>,
kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<14>, kernel_caller<15>,
kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
//0,0,0, 0,0,0, 0,0,kernel_caller<9>
};
#endif
const int calles_num = sizeof(callers)/sizeof(callers[0]);
void stereoBM_GPU(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t& stream)
{
int winsz2 = winsz >> 1;
if (winsz2 == 0 || winsz2 >= calles_num)
if (winsz2 == 0 || winsz2 >= calles_num || callers[winsz2] == 0)
cv::gpu::error("Unsupported window size", __FILE__, __LINE__, "stereoBM_GPU");
//cudaSafeCall( cudaFuncSetCacheConfig(&stereoKernel, cudaFuncCachePreferL1) );

View File

@ -278,6 +278,7 @@ namespace cv { namespace gpu { namespace device
{
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[3][5] =
{
{
@ -285,25 +286,55 @@ namespace cv { namespace gpu { namespace device
WarpDispatcher<Transform, PointFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, PointFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, PointFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call
0/*WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call*/,
},
{
WarpDispatcher<Transform, LinearFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call
0/*WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call*/,
},
{
0/*WarpDispatcher<Transform, CubicFilter, BrdReflect101, T>::call*/,
0/*WarpDispatcher<Transform, CubicFilter, BrdReplicate, T>::call*/,
0/*WarpDispatcher<Transform, CubicFilter, BrdConstant, T>::call*/,
0/*WarpDispatcher<Transform, CubicFilter, BrdReflect, T>::call*/,
0/*WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call*/,
}
};
#else
static const func_t funcs[3][5] =
{
{
WarpDispatcher<Transform, PointFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, PointFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, PointFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, PointFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call,
},
{
WarpDispatcher<Transform, LinearFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call,
},
{
WarpDispatcher<Transform, CubicFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call
WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call,
}
};
#endif
funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
const func_t func = funcs[interpolation][borderMode];
if (!func)
cv::gpu::error("Unsupported input parameters for warp_caller", __FILE__, __LINE__, "");
func(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
}
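A compressed standalone sketch (hypothetical names; TINY_BUILD stands in for OPENCV_TINY_GPU_MODULE) of the pattern used in warp_caller above: a compile-time swap of the dispatch table, with the cubic row and the BrdWrap column nulled in the tiny variant, plus the new runtime check on the looked-up entry before it is called.

    #include <stdexcept>

    typedef void (*warp_func_t)(const float* src, float* dst);

    void warp_point (const float*, float*) { /* nearest-neighbour resampling */ }
    void warp_linear(const float*, float*) { /* bilinear resampling */ }
    void warp_cubic (const float*, float*) { /* bicubic resampling */ }

    void warp_caller(const float* src, float* dst, int interpolation /* 0=point, 1=linear, 2=cubic */)
    {
    #ifdef TINY_BUILD
        static const warp_func_t funcs[3] = { warp_point, warp_linear, 0 /* cubic dropped */ };
    #else
        static const warp_func_t funcs[3] = { warp_point, warp_linear, warp_cubic };
    #endif
        const warp_func_t func = funcs[interpolation];
        if (!func)
            throw std::runtime_error("Unsupported input parameters for warp_caller");
        func(src, dst);
    }

Keeping both tables the same shape means the callers need no other change; only the lookup result has to be validated.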
@ -320,6 +351,7 @@ namespace cv { namespace gpu { namespace device
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
#ifndef OPENCV_TINY_GPU_MODULE
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@ -339,6 +371,7 @@ namespace cv { namespace gpu { namespace device
//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
#endif
template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@ -358,6 +391,7 @@ namespace cv { namespace gpu { namespace device
template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
#ifndef OPENCV_TINY_GPU_MODULE
//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@ -377,6 +411,7 @@ namespace cv { namespace gpu { namespace device
//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
#endif
template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);

View File

@ -48,10 +48,16 @@ namespace cv { namespace gpu { namespace device
#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \
#ifdef OPENCV_TINY_GPU_MODULE
#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
#else
#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
#endif
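A sketch (hypothetical names; TINY_BUILD and DECLARE_* are stand-ins, not the real macros) of the declaration-macro split above: under the tiny build the *_ALL variant simply omits the 16-bit entry, so the _16u colour-conversion kernels are never declared or instantiated.

    #define DECLARE_ONE(name) void name(const void* src, void* dst);

    #ifdef TINY_BUILD
    #define DECLARE_ALL(name) \
        DECLARE_ONE(name ## _8u)  \
        DECLARE_ONE(name ## _32f)
    #else
    #define DECLARE_ALL(name) \
        DECLARE_ONE(name ## _8u)  \
        DECLARE_ONE(name ## _16u) \
        DECLARE_ONE(name ## _32f)
    #endif

    DECLARE_ALL(bgr_to_gray)   // expands to bgr_to_gray_8u / (_16u) / _32f prototypes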
#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \

View File

@ -77,6 +77,17 @@ void cv::gpu::bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, f
typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, int borderMode, cudaStream_t s);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[6][4] =
{
{bilateral_filter_gpu<uchar> , 0 /*bilateral_filter_gpu<uchar2>*/ , bilateral_filter_gpu<uchar3> , bilateral_filter_gpu<uchar4> },
{0 /*bilateral_filter_gpu<schar>*/ , 0 /*bilateral_filter_gpu<schar2>*/ , 0 /*bilateral_filter_gpu<schar3>*/ , 0 /*bilateral_filter_gpu<schar4>*/ },
{0 /*bilateral_filter_gpu<ushort>*/, 0 /*bilateral_filter_gpu<ushort2>*/, 0 /*bilateral_filter_gpu<ushort3>*/, 0 /*bilateral_filter_gpu<ushort4>*/},
{0 /*bilateral_filter_gpu<short>*/ , 0 /*bilateral_filter_gpu<short2>*/ , 0 /*bilateral_filter_gpu<short3>*/ , 0 /*bilateral_filter_gpu<short4>*/ },
{0 /*bilateral_filter_gpu<int>*/ , 0 /*bilateral_filter_gpu<int2>*/ , 0 /*bilateral_filter_gpu<int3>*/ , 0 /*bilateral_filter_gpu<int4>*/ },
{bilateral_filter_gpu<float> , 0 /*bilateral_filter_gpu<float2>*/ , bilateral_filter_gpu<float3> , bilateral_filter_gpu<float4> }
};
#else
static const func_t funcs[6][4] =
{
{bilateral_filter_gpu<uchar> , 0 /*bilateral_filter_gpu<uchar2>*/ , bilateral_filter_gpu<uchar3> , bilateral_filter_gpu<uchar4> },
@ -86,6 +97,7 @@ void cv::gpu::bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, f
{0 /*bilateral_filter_gpu<int>*/ , 0 /*bilateral_filter_gpu<int2>*/ , 0 /*bilateral_filter_gpu<int3>*/ , 0 /*bilateral_filter_gpu<int4>*/ },
{bilateral_filter_gpu<float> , 0 /*bilateral_filter_gpu<float2>*/ , bilateral_filter_gpu<float3> , bilateral_filter_gpu<float4> }
};
#endif
sigma_color = (sigma_color <= 0 ) ? 1 : sigma_color;
sigma_spatial = (sigma_spatial <= 0 ) ? 1 : sigma_spatial;

File diff suppressed because it is too large

View File

@ -789,12 +789,14 @@ Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const
case CV_8UC4:
func = filter2D_gpu<uchar4, uchar4>;
break;
#ifndef OPENCV_TINY_GPU_MODULE
case CV_16UC1:
func = filter2D_gpu<ushort, ushort>;
break;
case CV_16UC4:
func = filter2D_gpu<ushort4, ushort4>;
break;
#endif
case CV_32FC1:
func = filter2D_gpu<float, float>;
break;
@ -893,6 +895,18 @@ namespace
Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)
{
#ifdef OPENCV_TINY_GPU_MODULE
static const gpuFilter1D_t funcs[7][4] =
{
{filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>},
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>},
{0, 0, 0, 0}
};
#else
static const gpuFilter1D_t funcs[7][4] =
{
{filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>},
@ -903,6 +917,7 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
{filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>},
{0, 0, 0, 0}
};
#endif
static const nppFilter1D_t npp_funcs[] =
{
0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R
@ -998,6 +1013,18 @@ namespace
Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)
{
#ifdef OPENCV_TINY_GPU_MODULE
static const gpuFilter1D_t funcs[7][4] =
{
{filter::linearColumn<float, uchar>, 0, filter::linearColumn<float3, uchar3>, filter::linearColumn<float4, uchar4>},
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{filter::linearColumn<float, float>, 0, filter::linearColumn<float3, float3>, filter::linearColumn<float4, float4>},
{0, 0, 0, 0}
};
#else
static const gpuFilter1D_t funcs[7][4] =
{
{filter::linearColumn<float, uchar>, 0, filter::linearColumn<float3, uchar3>, filter::linearColumn<float4, uchar4>},
@ -1008,6 +1035,7 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
{filter::linearColumn<float, float>, 0, filter::linearColumn<float3, float3>, filter::linearColumn<float4, float4>},
{0, 0, 0, 0}
};
#endif
static const nppFilter1D_t npp_funcs[] =
{
0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R

View File

@ -336,6 +336,17 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
#endif
{
typedef void (*caller_t)(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream);
#ifdef OPENCV_TINY_GPU_MODULE
static const caller_t callers[6][4] =
{
{ copyMakeBorder_caller<uchar, 1> , copyMakeBorder_caller<uchar, 2> , copyMakeBorder_caller<uchar, 3> , copyMakeBorder_caller<uchar, 4>},
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{ copyMakeBorder_caller<float, 1> , 0/*copyMakeBorder_caller<float, 2>*/, copyMakeBorder_caller<float, 3> , copyMakeBorder_caller<float ,4>}
};
#else
static const caller_t callers[6][4] =
{
{ copyMakeBorder_caller<uchar, 1> , copyMakeBorder_caller<uchar, 2> , copyMakeBorder_caller<uchar, 3> , copyMakeBorder_caller<uchar, 4>},
@ -345,6 +356,7 @@ void cv::gpu::copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bottom
{0/*copyMakeBorder_caller<int, 1>*/, 0/*copyMakeBorder_caller<int, 2>*/ , 0/*copyMakeBorder_caller<int, 3>*/, 0/*copyMakeBorder_caller<int , 4>*/},
{ copyMakeBorder_caller<float, 1> , 0/*copyMakeBorder_caller<float, 2>*/ , copyMakeBorder_caller<float, 3> , copyMakeBorder_caller<float ,4>}
};
#endif
caller_t func = callers[src.depth()][src.channels() - 1];
CV_Assert(func != 0);

View File

@ -261,6 +261,18 @@ Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
{
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[7][5] =
{
{0, ::sum::run<uchar , 1>, 0, 0, 0},
{0, 0, 0, 0, 0},
{0, 0, 0, 0, 0},
{0, 0, 0, 0, 0},
{0, 0, 0, 0, 0},
{0, ::sum::run<float , 1>, 0, 0, 0},
{0, 0, 0, 0, 0},
};
#else
static const func_t funcs[7][5] =
{
{0, ::sum::run<uchar , 1>, ::sum::run<uchar , 2>, ::sum::run<uchar , 3>, ::sum::run<uchar , 4>},
@ -271,6 +283,7 @@ Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
{0, ::sum::run<float , 1>, ::sum::run<float , 2>, ::sum::run<float , 3>, ::sum::run<float , 4>},
{0, ::sum::run<double, 1>, ::sum::run<double, 2>, ::sum::run<double, 3>, ::sum::run<double, 4>}
};
#endif
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
@ -286,6 +299,8 @@ Scalar cv::gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
buf.setTo(Scalar::all(0));
const func_t func = funcs[src.depth()][src.channels()];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
double result[4];
func(src, buf.data, result, mask);
@ -307,6 +322,18 @@ Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
{
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[7][5] =
{
{0, ::sum::runAbs<uchar , 1>, 0, 0, 0},
{0, 0, 0, 0, 0},
{0, 0, 0, 0, 0},
{0, 0, 0, 0, 0},
{0, 0, 0, 0, 0},
{0, ::sum::runAbs<float , 1>, 0, 0, 0},
{0, 0, 0, 0, 0},
};
#else
static const func_t funcs[7][5] =
{
{0, ::sum::runAbs<uchar , 1>, ::sum::runAbs<uchar , 2>, ::sum::runAbs<uchar , 3>, ::sum::runAbs<uchar , 4>},
@ -317,6 +344,7 @@ Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
{0, ::sum::runAbs<float , 1>, ::sum::runAbs<float , 2>, ::sum::runAbs<float , 3>, ::sum::runAbs<float , 4>},
{0, ::sum::runAbs<double, 1>, ::sum::runAbs<double, 2>, ::sum::runAbs<double, 3>, ::sum::runAbs<double, 4>}
};
#endif
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
@ -332,6 +360,8 @@ Scalar cv::gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
buf.setTo(Scalar::all(0));
const func_t func = funcs[src.depth()][src.channels()];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
double result[4];
func(src, buf.data, result, mask);
@ -353,6 +383,18 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
{
typedef void (*func_t)(PtrStepSzb src, void* buf, double* sum, PtrStepSzb mask);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[7][5] =
{
{0, ::sum::runSqr<uchar , 1>, 0, 0, 0},
{0, 0, 0, 0, 0},
{0, 0, 0, 0, 0},
{0, 0, 0, 0, 0},
{0, 0, 0, 0, 0},
{0, ::sum::runSqr<float , 1>, 0, 0, 0},
{0, 0, 0, 0, 0},
};
#else
static const func_t funcs[7][5] =
{
{0, ::sum::runSqr<uchar , 1>, ::sum::runSqr<uchar , 2>, ::sum::runSqr<uchar , 3>, ::sum::runSqr<uchar , 4>},
@ -363,6 +405,7 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
{0, ::sum::runSqr<float , 1>, ::sum::runSqr<float , 2>, ::sum::runSqr<float , 3>, ::sum::runSqr<float , 4>},
{0, ::sum::runSqr<double, 1>, ::sum::runSqr<double, 2>, ::sum::runSqr<double, 3>, ::sum::runSqr<double, 4>}
};
#endif
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == src.size()) );
@ -378,6 +421,8 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
buf.setTo(Scalar::all(0));
const func_t func = funcs[src.depth()][src.channels()];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
double result[4];
func(src, buf.data, result, mask);
@ -405,6 +450,18 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
{
typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[] =
{
::minMax::run<uchar>,
0/*::minMax::run<schar>*/,
0/*::minMax::run<ushort>*/,
0/*::minMax::run<short>*/,
0/*::minMax::run<int>*/,
::minMax::run<float>,
0/*::minMax::run<double>*/,
};
#else
static const func_t funcs[] =
{
::minMax::run<uchar>,
@ -413,8 +470,9 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
::minMax::run<short>,
::minMax::run<int>,
::minMax::run<float>,
::minMax::run<double>
::minMax::run<double>,
};
#endif
CV_Assert( src.channels() == 1 );
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
@ -430,6 +488,8 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
ensureSizeIsEnough(buf_size, CV_8U, buf);
const func_t func = funcs[src.depth()];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
double temp1, temp2;
func(src, mask, minVal ? minVal : &temp1, maxVal ? maxVal : &temp2, buf);
@ -456,6 +516,18 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)
{
typedef void (*func_t)(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[] =
{
::minMaxLoc::run<uchar>,
0/*::minMaxLoc::run<schar>*/,
0/*::minMaxLoc::run<ushort>*/,
0/*::minMaxLoc::run<short>*/,
::minMaxLoc::run<int>,
::minMaxLoc::run<float>,
0/*::minMaxLoc::run<double>*/,
};
#else
static const func_t funcs[] =
{
::minMaxLoc::run<uchar>,
@ -464,8 +536,9 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
::minMaxLoc::run<short>,
::minMaxLoc::run<int>,
::minMaxLoc::run<float>,
::minMaxLoc::run<double>
::minMaxLoc::run<double>,
};
#endif
CV_Assert( src.channels() == 1 );
CV_Assert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
@ -482,6 +555,8 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
ensureSizeIsEnough(locbuf_size, CV_8U, locBuf);
const func_t func = funcs[src.depth()];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
double temp1, temp2;
Point temp3, temp4;
@ -508,6 +583,18 @@ int cv::gpu::countNonZero(const GpuMat& src)
int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
{
typedef int (*func_t)(const PtrStepSzb src, PtrStep<unsigned int> buf);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[] =
{
::countNonZero::run<uchar>,
0/*::countNonZero::run<schar>*/,
0/*::countNonZero::run<ushort>*/,
0/*::countNonZero::run<short>*/,
0/*::countNonZero::run<int>*/,
::countNonZero::run<float>,
0/*::countNonZero::run<double>*/,
};
#else
static const func_t funcs[] =
{
::countNonZero::run<uchar>,
@ -516,8 +603,9 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
::countNonZero::run<short>,
::countNonZero::run<int>,
::countNonZero::run<float>,
::countNonZero::run<double>
::countNonZero::run<double>,
};
#endif
CV_Assert(src.channels() == 1);
@ -532,6 +620,8 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
ensureSizeIsEnough(buf_size, CV_8U, buf);
const func_t func = funcs[src.depth()];
if (!func)
CV_Error(CV_StsUnsupportedFormat, "Unsupported combination of source and destination types");
return func(src, buf);
}
@ -562,6 +652,74 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
if (dim == 0)
{
typedef void (*func_t)(PtrStepSzb src, void* dst, int op, cudaStream_t stream);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[7][7] =
{
{
::reduce::rows<unsigned char, int, unsigned char>,
0/*::reduce::rows<unsigned char, int, signed char>*/,
0/*::reduce::rows<unsigned char, int, unsigned short>*/,
0/*::reduce::rows<unsigned char, int, short>*/,
0/*::reduce::rows<unsigned char, int, int>*/,
::reduce::rows<unsigned char, float, float>,
0/*::reduce::rows<unsigned char, double, double>*/,
},
{
0/*::reduce::rows<signed char, int, unsigned char>*/,
0/*::reduce::rows<signed char, int, signed char>*/,
0/*::reduce::rows<signed char, int, unsigned short>*/,
0/*::reduce::rows<signed char, int, short>*/,
0/*::reduce::rows<signed char, int, int>*/,
0/*::reduce::rows<signed char, float, float>*/,
0/*::reduce::rows<signed char, double, double>*/,
},
{
0/*::reduce::rows<unsigned short, int, unsigned char>*/,
0/*::reduce::rows<unsigned short, int, signed char>*/,
0/*::reduce::rows<unsigned short, int, unsigned short>*/,
0/*::reduce::rows<unsigned short, int, short>*/,
0/*::reduce::rows<unsigned short, int, int>*/,
0/*::reduce::rows<unsigned short, float, float>*/,
0/*::reduce::rows<unsigned short, double, double>*/,
},
{
0/*::reduce::rows<short, int, unsigned char>*/,
0/*::reduce::rows<short, int, signed char>*/,
0/*::reduce::rows<short, int, unsigned short>*/,
0/*::reduce::rows<short, int, short>*/,
0/*::reduce::rows<short, int, int>*/,
0/*::reduce::rows<short, float, float>*/,
0/*::reduce::rows<short, double, double>*/,
},
{
0/*::reduce::rows<int, int, unsigned char>*/,
0/*::reduce::rows<int, int, signed char>*/,
0/*::reduce::rows<int, int, unsigned short>*/,
0/*::reduce::rows<int, int, short>*/,
0/*::reduce::rows<int, int, int>*/,
0/*::reduce::rows<int, float, float>*/,
0/*::reduce::rows<int, double, double>*/,
},
{
0/*::reduce::rows<float, float, unsigned char>*/,
0/*::reduce::rows<float, float, signed char>*/,
0/*::reduce::rows<float, float, unsigned short>*/,
0/*::reduce::rows<float, float, short>*/,
0/*::reduce::rows<float, float, int>*/,
::reduce::rows<float, float, float>,
0/*::reduce::rows<float, double, double>*/,
},
{
0/*::reduce::rows<double, double, unsigned char>*/,
0/*::reduce::rows<double, double, signed char>*/,
0/*::reduce::rows<double, double, unsigned short>*/,
0/*::reduce::rows<double, double, short>*/,
0/*::reduce::rows<double, double, int>*/,
0/*::reduce::rows<double, double, float>*/,
0/*::reduce::rows<double, double, double>*/,
}
};
#else
static const func_t funcs[7][7] =
{
{
@ -571,7 +729,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
0/*::reduce::rows<unsigned char, int, short>*/,
::reduce::rows<unsigned char, int, int>,
::reduce::rows<unsigned char, float, float>,
::reduce::rows<unsigned char, double, double>
::reduce::rows<unsigned char, double, double>,
},
{
0/*::reduce::rows<signed char, int, unsigned char>*/,
@ -580,7 +738,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
0/*::reduce::rows<signed char, int, short>*/,
0/*::reduce::rows<signed char, int, int>*/,
0/*::reduce::rows<signed char, float, float>*/,
0/*::reduce::rows<signed char, double, double>*/
0/*::reduce::rows<signed char, double, double>*/,
},
{
0/*::reduce::rows<unsigned short, int, unsigned char>*/,
@ -589,7 +747,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
0/*::reduce::rows<unsigned short, int, short>*/,
::reduce::rows<unsigned short, int, int>,
::reduce::rows<unsigned short, float, float>,
::reduce::rows<unsigned short, double, double>
::reduce::rows<unsigned short, double, double>,
},
{
0/*::reduce::rows<short, int, unsigned char>*/,
@ -598,7 +756,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
::reduce::rows<short, int, short>,
::reduce::rows<short, int, int>,
::reduce::rows<short, float, float>,
::reduce::rows<short, double, double>
::reduce::rows<short, double, double>,
},
{
0/*::reduce::rows<int, int, unsigned char>*/,
@ -607,7 +765,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
0/*::reduce::rows<int, int, short>*/,
::reduce::rows<int, int, int>,
::reduce::rows<int, float, float>,
::reduce::rows<int, double, double>
::reduce::rows<int, double, double>,
},
{
0/*::reduce::rows<float, float, unsigned char>*/,
@ -616,7 +774,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
0/*::reduce::rows<float, float, short>*/,
0/*::reduce::rows<float, float, int>*/,
::reduce::rows<float, float, float>,
::reduce::rows<float, double, double>
::reduce::rows<float, double, double>,
},
{
0/*::reduce::rows<double, double, unsigned char>*/,
@ -625,9 +783,10 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
0/*::reduce::rows<double, double, short>*/,
0/*::reduce::rows<double, double, int>*/,
0/*::reduce::rows<double, double, float>*/,
::reduce::rows<double, double, double>
::reduce::rows<double, double, double>,
}
};
#endif
const func_t func = funcs[src.depth()][dst.depth()];
@ -639,6 +798,74 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
else
{
typedef void (*func_t)(PtrStepSzb src, void* dst, int cn, int op, cudaStream_t stream);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[7][7] =
{
{
::reduce::cols<unsigned char, int, unsigned char>,
0/*::reduce::cols<unsigned char, int, signed char>*/,
0/*::reduce::cols<unsigned char, int, unsigned short>*/,
0/*::reduce::cols<unsigned char, int, short>*/,
0/*::reduce::cols<unsigned char, int, int>*/,
::reduce::cols<unsigned char, float, float>,
0/*::reduce::cols<unsigned char, double, double>*/,
},
{
0/*::reduce::cols<signed char, int, unsigned char>*/,
0/*::reduce::cols<signed char, int, signed char>*/,
0/*::reduce::cols<signed char, int, unsigned short>*/,
0/*::reduce::cols<signed char, int, short>*/,
0/*::reduce::cols<signed char, int, int>*/,
0/*::reduce::cols<signed char, float, float>*/,
0/*::reduce::cols<signed char, double, double>*/,
},
{
0/*::reduce::cols<unsigned short, int, unsigned char>*/,
0/*::reduce::cols<unsigned short, int, signed char>*/,
0/*::reduce::cols<unsigned short, int, unsigned short>*/,
0/*::reduce::cols<unsigned short, int, short>*/,
0/*::reduce::cols<unsigned short, int, int>*/,
0/*::reduce::cols<unsigned short, float, float>*/,
0/*::reduce::cols<unsigned short, double, double>*/,
},
{
0/*::reduce::cols<short, int, unsigned char>*/,
0/*::reduce::cols<short, int, signed char>*/,
0/*::reduce::cols<short, int, unsigned short>*/,
0/*::reduce::cols<short, int, short>*/,
0/*::reduce::cols<short, int, int>*/,
0/*::reduce::cols<short, float, float>*/,
0/*::reduce::cols<short, double, double>*/,
},
{
0/*::reduce::cols<int, int, unsigned char>*/,
0/*::reduce::cols<int, int, signed char>*/,
0/*::reduce::cols<int, int, unsigned short>*/,
0/*::reduce::cols<int, int, short>*/,
0/*::reduce::cols<int, int, int>*/,
0/*::reduce::cols<int, float, float>*/,
0/*::reduce::cols<int, double, double>*/,
},
{
0/*::reduce::cols<float, float, unsigned char>*/,
0/*::reduce::cols<float, float, signed char>*/,
0/*::reduce::cols<float, float, unsigned short>*/,
0/*::reduce::cols<float, float, short>*/,
0/*::reduce::cols<float, float, int>*/,
::reduce::cols<float, float, float>,
0/*::reduce::cols<float, double, double>*/,
},
{
0/*::reduce::cols<double, double, unsigned char>*/,
0/*::reduce::cols<double, double, signed char>*/,
0/*::reduce::cols<double, double, unsigned short>*/,
0/*::reduce::cols<double, double, short>*/,
0/*::reduce::cols<double, double, int>*/,
0/*::reduce::cols<double, double, float>*/,
0/*::reduce::cols<double, double, double>*/,
}
};
#else
static const func_t funcs[7][7] =
{
{
@ -648,7 +875,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
0/*::reduce::cols<unsigned char, int, short>*/,
::reduce::cols<unsigned char, int, int>,
::reduce::cols<unsigned char, float, float>,
::reduce::cols<unsigned char, double, double>
::reduce::cols<unsigned char, double, double>,
},
{
0/*::reduce::cols<signed char, int, unsigned char>*/,
@ -657,7 +884,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
0/*::reduce::cols<signed char, int, short>*/,
0/*::reduce::cols<signed char, int, int>*/,
0/*::reduce::cols<signed char, float, float>*/,
0/*::reduce::cols<signed char, double, double>*/
0/*::reduce::cols<signed char, double, double>*/,
},
{
0/*::reduce::cols<unsigned short, int, unsigned char>*/,
@ -666,7 +893,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
0/*::reduce::cols<unsigned short, int, short>*/,
::reduce::cols<unsigned short, int, int>,
::reduce::cols<unsigned short, float, float>,
::reduce::cols<unsigned short, double, double>
::reduce::cols<unsigned short, double, double>,
},
{
0/*::reduce::cols<short, int, unsigned char>*/,
@ -675,7 +902,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
::reduce::cols<short, int, short>,
::reduce::cols<short, int, int>,
::reduce::cols<short, float, float>,
::reduce::cols<short, double, double>
::reduce::cols<short, double, double>,
},
{
0/*::reduce::cols<int, int, unsigned char>*/,
@ -684,7 +911,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
0/*::reduce::cols<int, int, short>*/,
::reduce::cols<int, int, int>,
::reduce::cols<int, float, float>,
::reduce::cols<int, double, double>
::reduce::cols<int, double, double>,
},
{
0/*::reduce::cols<float, float, unsigned char>*/,
@ -693,7 +920,7 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
0/*::reduce::cols<float, float, short>*/,
0/*::reduce::cols<float, float, int>*/,
::reduce::cols<float, float, float>,
::reduce::cols<float, double, double>
::reduce::cols<float, double, double>,
},
{
0/*::reduce::cols<double, double, unsigned char>*/,
@ -702,9 +929,10 @@ void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int
0/*::reduce::cols<double, double, short>*/,
0/*::reduce::cols<double, double, int>*/,
0/*::reduce::cols<double, double, float>*/,
::reduce::cols<double, double, double>
::reduce::cols<double, double, double>,
}
};
#endif
const func_t func = funcs[src.depth()][dst.depth()];
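A standalone sketch (hypothetical names) contrasting the table layouts used in the reduction wrappers above: sum/absSum/sqrSum index a [7][5] table with the 1-based channel count (column 0 stays null), whereas the warp/remap/copyMakeBorder wrappers use [6][4] tables indexed by channels - 1. In both cases the tiny build leaves only the 8U and 32F single-channel slots filled, and the added null check turns a missing slot into an "unsupported format" error instead of a call through a null pointer.

    #include <stdexcept>

    typedef void (*func_t)(const void* src, void* dst);

    void run_8uc1 (const void*, void*) {}   // stand-ins for ::sum::run<uchar, 1> etc.
    void run_32fc1(const void*, void*) {}

    void gpu_sum(const void* src, void* dst, int depth /* 0..6 */, int channels /* 1..4 */)
    {
        static const func_t funcs[7][5] =
        {
            {0, run_8uc1 , 0, 0, 0},  // CV_8U  : tiny build keeps only C1
            {0, 0, 0, 0, 0},          // CV_8S
            {0, 0, 0, 0, 0},          // CV_16U
            {0, 0, 0, 0, 0},          // CV_16S
            {0, 0, 0, 0, 0},          // CV_32S
            {0, run_32fc1, 0, 0, 0},  // CV_32F : tiny build keeps only C1
            {0, 0, 0, 0, 0},          // CV_64F : dropped entirely
        };

        const func_t func = funcs[depth][channels];   // 1-based channel index
        if (!func)
            throw std::runtime_error("Unsupported combination of source and destination types");
        func(src, dst);
    }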

View File

@ -68,6 +68,17 @@ void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[6][4] =
{
{pyrDown_gpu<uchar> , 0 /*pyrDown_gpu<uchar2>*/ , pyrDown_gpu<uchar3> , pyrDown_gpu<uchar4> },
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{pyrDown_gpu<float> , 0 /*pyrDown_gpu<float2>*/ , pyrDown_gpu<float3> , pyrDown_gpu<float4> }
};
#else
static const func_t funcs[6][4] =
{
{pyrDown_gpu<uchar> , 0 /*pyrDown_gpu<uchar2>*/ , pyrDown_gpu<uchar3> , pyrDown_gpu<uchar4> },
@ -77,6 +88,7 @@ void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
{0 /*pyrDown_gpu<int>*/ , 0 /*pyrDown_gpu<int2>*/ , 0 /*pyrDown_gpu<int3>*/ , 0 /*pyrDown_gpu<int4>*/ },
{pyrDown_gpu<float> , 0 /*pyrDown_gpu<float2>*/ , pyrDown_gpu<float3> , pyrDown_gpu<float4> }
};
#endif
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
@ -106,6 +118,17 @@ void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[6][4] =
{
{pyrUp_gpu<uchar> , 0 /*pyrUp_gpu<uchar2>*/ , pyrUp_gpu<uchar3> , pyrUp_gpu<uchar4> },
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{pyrUp_gpu<float> , 0 /*pyrUp_gpu<float2>*/ , pyrUp_gpu<float3> , pyrUp_gpu<float4> }
};
#else
static const func_t funcs[6][4] =
{
{pyrUp_gpu<uchar> , 0 /*pyrUp_gpu<uchar2>*/ , pyrUp_gpu<uchar3> , pyrUp_gpu<uchar4> },
@ -115,6 +138,7 @@ void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
{0 /*pyrUp_gpu<int>*/ , 0 /*pyrUp_gpu<int2>*/ , 0 /*pyrUp_gpu<int3>*/ , 0 /*pyrUp_gpu<int4>*/ },
{pyrUp_gpu<float> , 0 /*pyrUp_gpu<float2>*/ , pyrUp_gpu<float3> , pyrUp_gpu<float4> }
};
#endif
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);

View File

@ -65,6 +65,17 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[6][4] =
{
{remap_gpu<uchar> , 0 /*remap_gpu<uchar2>*/ , remap_gpu<uchar3> , remap_gpu<uchar4> },
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{remap_gpu<float> , 0 /*remap_gpu<float2>*/ , remap_gpu<float3> , remap_gpu<float4> }
};
#else
static const func_t funcs[6][4] =
{
{remap_gpu<uchar> , 0 /*remap_gpu<uchar2>*/ , remap_gpu<uchar3> , remap_gpu<uchar4> },
@ -74,6 +85,7 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
{0 /*remap_gpu<int>*/ , 0 /*remap_gpu<int2>*/ , 0 /*remap_gpu<int3>*/ , 0 /*remap_gpu<int4>*/ },
{remap_gpu<float> , 0 /*remap_gpu<float2>*/ , remap_gpu<float3> , remap_gpu<float4> }
};
#endif
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size());

View File

@ -57,6 +57,18 @@ namespace cv { namespace gpu { namespace device
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& stream)
{
typedef void (*func_t)(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[6][4] =
{
{device::resize<uchar> , 0 /*device::resize<uchar2>*/ , device::resize<uchar3> , device::resize<uchar4> },
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{device::resize<float> , 0 /*device::resize<float2>*/ , device::resize<float3> , device::resize<float4> }
};
#else
static const func_t funcs[6][4] =
{
{device::resize<uchar> , 0 /*device::resize<uchar2>*/ , device::resize<uchar3> , device::resize<uchar4> },
@ -66,6 +78,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
{0 /*device::resize<int>*/ , 0 /*device::resize<int2>*/ , 0 /*device::resize<int3>*/ , 0 /*device::resize<int4>*/ },
{device::resize<float> , 0 /*device::resize<float2>*/ , device::resize<float3> , device::resize<float4> }
};
#endif
CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
CV_Assert( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_AREA );

View File

@ -277,6 +277,17 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[6][4] =
{
{warpAffine_gpu<uchar> , 0 /*warpAffine_gpu<uchar2>*/ , warpAffine_gpu<uchar3> , warpAffine_gpu<uchar4> },
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{warpAffine_gpu<float> , 0 /*warpAffine_gpu<float2>*/ , warpAffine_gpu<float3> , warpAffine_gpu<float4> }
};
#else
static const func_t funcs[6][4] =
{
{warpAffine_gpu<uchar> , 0 /*warpAffine_gpu<uchar2>*/ , warpAffine_gpu<uchar3> , warpAffine_gpu<uchar4> },
@ -286,6 +297,7 @@ void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsiz
{0 /*warpAffine_gpu<int>*/ , 0 /*warpAffine_gpu<int2>*/ , 0 /*warpAffine_gpu<int3>*/ , 0 /*warpAffine_gpu<int4>*/ },
{warpAffine_gpu<float> , 0 /*warpAffine_gpu<float2>*/ , warpAffine_gpu<float3> , warpAffine_gpu<float4> }
};
#endif
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
@ -415,6 +427,17 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
#ifdef OPENCV_TINY_GPU_MODULE
static const func_t funcs[6][4] =
{
{warpPerspective_gpu<uchar> , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3> , warpPerspective_gpu<uchar4> },
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
{warpPerspective_gpu<float> , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3> , warpPerspective_gpu<float4> }
};
#else
static const func_t funcs[6][4] =
{
{warpPerspective_gpu<uchar> , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3> , warpPerspective_gpu<uchar4> },
@ -424,6 +447,7 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size
{0 /*warpPerspective_gpu<int>*/ , 0 /*warpPerspective_gpu<int2>*/ , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ },
{warpPerspective_gpu<float> , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3> , warpPerspective_gpu<float4> }
};
#endif
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);

View File

@ -2285,11 +2285,19 @@ GPU_TEST_P(CvtColor, BayerGR2Gray)
EXPECT_MAT_NEAR(dst_gold(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), dst(cv::Rect(1, 1, dst.cols - 2, dst.rows - 2)), 2);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CvtColor, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_8U), MatDepth(CV_32F)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CvtColor, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32F)),
WHOLE_SUBMAT));
#endif
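A sketch (hypothetical fixture; TINY_BUILD stands in for OPENCV_TINY_GPU_MODULE) of the pattern used throughout the test changes: the test bodies stay untouched and only the parameter list of the value-parameterized instantiation shrinks to the depths the tiny build supports.

    #include <gtest/gtest.h>

    class DepthTest : public ::testing::TestWithParam<int> {};

    TEST_P(DepthTest, Runs)
    {
        SUCCEED() << "depth id " << GetParam();
    }

    #ifdef TINY_BUILD
    INSTANTIATE_TEST_CASE_P(Tiny, DepthTest,
        ::testing::Values(0 /*CV_8U*/, 5 /*CV_32F*/));
    #else
    INSTANTIATE_TEST_CASE_P(Full, DepthTest,
        ::testing::Values(0 /*CV_8U*/, 2 /*CV_16U*/, 5 /*CV_32F*/));
    #endif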
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Demosaicing

View File

@ -87,6 +87,20 @@ GPU_TEST_P(CopyMakeBorder, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CopyMakeBorder, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1),
MatType(CV_8UC3),
MatType(CV_8UC4),
MatType(CV_32FC1),
MatType(CV_32FC3),
MatType(CV_32FC4)),
testing::Values(Border(1), Border(10), Border(50)),
ALL_BORDER_TYPES,
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CopyMakeBorder, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -102,5 +116,6 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, CopyMakeBorder, testing::Combine(
testing::Values(Border(1), Border(10), Border(50)),
ALL_BORDER_TYPES,
WHOLE_SUBMAT));
#endif
#endif // HAVE_CUDA

View File

@ -1341,11 +1341,19 @@ GPU_TEST_P(Abs, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Core, Abs, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_32F)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_Core, Abs, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_16S), MatDepth(CV_32F)),
WHOLE_SUBMAT));
#endif
////////////////////////////////////////////////////////////////////////////////
// Sqr
@ -1381,6 +1389,13 @@ GPU_TEST_P(Sqr, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Core, Sqr, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_32F)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_Core, Sqr, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -1389,6 +1404,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Sqr, testing::Combine(
MatDepth(CV_16S),
MatDepth(CV_32F)),
WHOLE_SUBMAT));
#endif
////////////////////////////////////////////////////////////////////////////////
// Sqrt
@ -1451,6 +1467,13 @@ GPU_TEST_P(Sqrt, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-5);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Core, Sqrt, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_32F)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_Core, Sqrt, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -1459,6 +1482,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Sqrt, testing::Combine(
MatDepth(CV_16S),
MatDepth(CV_32F)),
WHOLE_SUBMAT));
#endif
////////////////////////////////////////////////////////////////////////////////
// Log
@ -1521,6 +1545,13 @@ GPU_TEST_P(Log, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-6);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Core, Log, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_32F)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_Core, Log, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -1529,6 +1560,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Log, testing::Combine(
MatDepth(CV_16S),
MatDepth(CV_32F)),
WHOLE_SUBMAT));
#endif
////////////////////////////////////////////////////////////////////////////////
// Exp
@ -1601,6 +1633,13 @@ GPU_TEST_P(Exp, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, depth < CV_32F ? 1.0 : 1e-2);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Core, Exp, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_32F)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_Core, Exp, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -1609,6 +1648,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Exp, testing::Combine(
MatDepth(CV_16S),
MatDepth(CV_32F)),
WHOLE_SUBMAT));
#endif
////////////////////////////////////////////////////////////////////////////////
// Compare_Array
@ -1775,12 +1815,21 @@ GPU_TEST_P(Compare_Scalar, Accuracy)
}
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Core, Compare_Scalar, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
CmpCode::all(),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_Core, Compare_Scalar, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
TYPES(CV_8U, CV_64F, 1, 4),
CmpCode::all(),
WHOLE_SUBMAT));
#endif
//////////////////////////////////////////////////////////////////////////////
// Bitwise_Array
@ -1936,11 +1985,19 @@ GPU_TEST_P(Bitwise_Scalar, Xor)
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Core, Bitwise_Scalar, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_8U)),
testing::Values(Channels(1))));
#else
INSTANTIATE_TEST_CASE_P(GPU_Core, Bitwise_Scalar, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_32S)),
IMAGE_CHANNELS));
#endif
//////////////////////////////////////////////////////////////////////////////
// RShift
@ -2317,11 +2374,19 @@ GPU_TEST_P(Pow, Accuracy)
}
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Core, Pow, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_32F)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_Core, Pow, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
ALL_DEPTH,
WHOLE_SUBMAT));
#endif
//////////////////////////////////////////////////////////////////////////////
// AddWeighted
@ -2380,6 +2445,23 @@ GPU_TEST_P(AddWeighted, Accuracy)
}
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Core_1, AddWeighted, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_8U)),
testing::Values(MatDepth(CV_8U)),
testing::Values(MatDepth(CV_8U)),
WHOLE_SUBMAT));
INSTANTIATE_TEST_CASE_P(GPU_Core_2, AddWeighted, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_32F)),
testing::Values(MatDepth(CV_32F)),
testing::Values(MatDepth(CV_32F)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_Core, AddWeighted, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -2387,6 +2469,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, AddWeighted, testing::Combine(
ALL_DEPTH,
ALL_DEPTH,
WHOLE_SUBMAT));
#endif
//////////////////////////////////////////////////////////////////////////////
// GEMM
@ -2953,6 +3036,15 @@ GPU_TEST_P(Norm, Accuracy)
EXPECT_NEAR(val_gold, val, depth < CV_32F ? 0.0 : 1.0);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Core, Norm, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_8U),
MatDepth(CV_32F)),
testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2), NormCode(cv::NORM_INF)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_Core, Norm, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -2964,6 +3056,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Norm, testing::Combine(
MatDepth(CV_32F)),
testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2), NormCode(cv::NORM_INF)),
WHOLE_SUBMAT));
#endif
////////////////////////////////////////////////////////////////////////////////
// normDiff
@ -3136,11 +3229,19 @@ GPU_TEST_P(Sum, Sqr)
EXPECT_SCALAR_NEAR(val_gold, val, CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.5);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Core, Sum, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_32FC1)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_Core, Sum, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
TYPES(CV_8U, CV_64F, 1, 4),
WHOLE_SUBMAT));
#endif
////////////////////////////////////////////////////////////////////////////////
// MinMax
@ -3513,11 +3614,19 @@ PARAM_TEST_CASE(Reduce, cv::gpu::DeviceInfo, cv::Size, MatDepth, Channels, Reduc
type = CV_MAKE_TYPE(depth, channels);
if (reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN)
{
dst_depth = depth;
}
#ifndef OPENCV_TINY_GPU_MODULE
else if (reduceOp == CV_REDUCE_SUM)
{
dst_depth = depth == CV_8U ? CV_32S : depth < CV_64F ? CV_32F : depth;
}
#endif
else
{
dst_depth = depth < CV_32F ? CV_32F : depth;
}
dst_type = CV_MAKE_TYPE(dst_depth, channels);
}
@ -3553,6 +3662,16 @@ GPU_TEST_P(Reduce, Cols)
EXPECT_MAT_NEAR(dst_gold, dst, dst_depth < CV_32F ? 0.0 : 0.02);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Core, Reduce, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_8U),
MatDepth(CV_32F)),
ALL_CHANNELS,
ALL_REDUCE_CODES,
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_Core, Reduce, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -3564,6 +3683,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Core, Reduce, testing::Combine(
ALL_CHANNELS,
ALL_REDUCE_CODES,
WHOLE_SUBMAT));
#endif
//////////////////////////////////////////////////////////////////////////////
// Normalize

View File

@ -310,6 +310,7 @@ GPU_TEST_P(BruteForceMatcher, Match_Single)
ASSERT_EQ(0, badCount);
}
#ifndef OPENCV_TINY_GPU_MODULE
GPU_TEST_P(BruteForceMatcher, Match_Collection)
{
cv::gpu::BFMatcher_GPU matcher(normCode);
@ -363,6 +364,7 @@ GPU_TEST_P(BruteForceMatcher, Match_Collection)
ASSERT_EQ(0, badCount);
}
#endif
GPU_TEST_P(BruteForceMatcher, KnnMatch_2_Single)
{
@ -442,6 +444,7 @@ GPU_TEST_P(BruteForceMatcher, KnnMatch_3_Single)
ASSERT_EQ(0, badCount);
}
#ifndef OPENCV_TINY_GPU_MODULE
GPU_TEST_P(BruteForceMatcher, KnnMatch_2_Collection)
{
cv::gpu::BFMatcher_GPU matcher(normCode);
@ -565,6 +568,7 @@ GPU_TEST_P(BruteForceMatcher, KnnMatch_3_Collection)
ASSERT_EQ(0, badCount);
}
#endif
GPU_TEST_P(BruteForceMatcher, RadiusMatch_Single)
{
@ -615,6 +619,7 @@ GPU_TEST_P(BruteForceMatcher, RadiusMatch_Single)
}
}
#ifndef OPENCV_TINY_GPU_MODULE
GPU_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
{
cv::gpu::BFMatcher_GPU matcher(normCode);
@ -693,11 +698,20 @@ GPU_TEST_P(BruteForceMatcher, RadiusMatch_Collection)
ASSERT_EQ(0, badCount);
}
}
#endif
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Features2D, BruteForceMatcher, testing::Combine(
ALL_DEVICES,
testing::Values(NormCode(cv::NORM_L2)),
testing::Values(DescriptorSize(57), DescriptorSize(64), DescriptorSize(83), DescriptorSize(128), DescriptorSize(179), DescriptorSize(256), DescriptorSize(304)),
testing::Values(UseMask(false), UseMask(true))));
#else
INSTANTIATE_TEST_CASE_P(GPU_Features2D, BruteForceMatcher, testing::Combine(
ALL_DEVICES,
testing::Values(NormCode(cv::NORM_L1), NormCode(cv::NORM_L2)),
testing::Values(DescriptorSize(57), DescriptorSize(64), DescriptorSize(83), DescriptorSize(128), DescriptorSize(179), DescriptorSize(256), DescriptorSize(304)),
testing::Values(UseMask(false), UseMask(true))));
#endif
#endif // HAVE_CUDA

View File

@ -164,6 +164,21 @@ GPU_TEST_P(Sobel, Accuracy)
EXPECT_MAT_NEAR(getInnerROI(dst_gold, ksize), getInnerROI(dst, ksize), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Filter, Sobel, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_8U), MatDepth(CV_32F)),
IMAGE_CHANNELS,
testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7))),
testing::Values(Deriv_X(0), Deriv_X(1), Deriv_X(2)),
testing::Values(Deriv_Y(0), Deriv_Y(1), Deriv_Y(2)),
testing::Values(BorderType(cv::BORDER_REFLECT101),
BorderType(cv::BORDER_REPLICATE),
BorderType(cv::BORDER_CONSTANT),
BorderType(cv::BORDER_REFLECT)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_Filter, Sobel, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -177,6 +192,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, Sobel, testing::Combine(
BorderType(cv::BORDER_CONSTANT),
BorderType(cv::BORDER_REFLECT)),
WHOLE_SUBMAT));
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////
// Scharr
@ -227,6 +243,20 @@ GPU_TEST_P(Scharr, Accuracy)
EXPECT_MAT_NEAR(getInnerROI(dst_gold, cv::Size(3, 3)), getInnerROI(dst, cv::Size(3, 3)), CV_MAT_DEPTH(type) < CV_32F ? 0.0 : 0.1);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Filter, Scharr, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_8U), MatDepth(CV_32F)),
IMAGE_CHANNELS,
testing::Values(Deriv_X(0), Deriv_X(1)),
testing::Values(Deriv_Y(0), Deriv_Y(1)),
testing::Values(BorderType(cv::BORDER_REFLECT101),
BorderType(cv::BORDER_REPLICATE),
BorderType(cv::BORDER_CONSTANT),
BorderType(cv::BORDER_REFLECT)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_Filter, Scharr, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -239,6 +269,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, Scharr, testing::Combine(
BorderType(cv::BORDER_CONSTANT),
BorderType(cv::BORDER_REFLECT)),
WHOLE_SUBMAT));
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////
// GaussianBlur
@ -301,6 +332,21 @@ GPU_TEST_P(GaussianBlur, Accuracy)
}
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Filter, GaussianBlur, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatDepth(CV_8U), MatDepth(CV_32F)),
IMAGE_CHANNELS,
testing::Values(KSize(cv::Size(3, 3)),
KSize(cv::Size(5, 5)),
KSize(cv::Size(7, 7))),
testing::Values(BorderType(cv::BORDER_REFLECT101),
BorderType(cv::BORDER_REPLICATE),
BorderType(cv::BORDER_CONSTANT),
BorderType(cv::BORDER_REFLECT)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_Filter, GaussianBlur, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -326,6 +372,7 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, GaussianBlur, testing::Combine(
BorderType(cv::BORDER_CONSTANT),
BorderType(cv::BORDER_REFLECT)),
WHOLE_SUBMAT));
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////
// Laplacian
@ -565,6 +612,16 @@ GPU_TEST_P(Filter2D, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, CV_MAT_DEPTH(type) == CV_32F ? 1e-1 : 1.0);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_Filter, Filter2D, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC4)),
testing::Values(KSize(cv::Size(3, 3)), KSize(cv::Size(5, 5)), KSize(cv::Size(7, 7)), KSize(cv::Size(11, 11)), KSize(cv::Size(13, 13)), KSize(cv::Size(15, 15))),
testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_Filter, Filter2D, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -573,5 +630,6 @@ INSTANTIATE_TEST_CASE_P(GPU_Filter, Filter2D, testing::Combine(
testing::Values(Anchor(cv::Point(-1, -1)), Anchor(cv::Point(0, 0)), Anchor(cv::Point(2, 2))),
testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
WHOLE_SUBMAT));
#endif
#endif // HAVE_CUDA

View File

@ -357,11 +357,19 @@ GPU_TEST_P(Canny, Accuracy)
}
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny, testing::Combine(
ALL_DEVICES,
testing::Values(AppertureSize(3)),
testing::Values(L2gradient(false), L2gradient(true)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny, testing::Combine(
ALL_DEVICES,
testing::Values(AppertureSize(3), AppertureSize(5)),
testing::Values(L2gradient(false), L2gradient(true)),
WHOLE_SUBMAT));
#endif
////////////////////////////////////////////////////////////////////////////////
// MeanShift

View File

@ -80,11 +80,19 @@ GPU_TEST_P(PyrDown, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-4 : 1.0);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrDown, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
WHOLE_SUBMAT));
#endif
////////////////////////////////////////////////////////
// pyrUp
@ -120,10 +128,18 @@ GPU_TEST_P(PyrUp, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-4 : 1.0);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, PyrUp, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
WHOLE_SUBMAT));
#endif
#endif // HAVE_CUDA

View File

@ -169,6 +169,15 @@ GPU_TEST_P(Remap, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-3 : 1.0);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Remap, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Remap, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -176,5 +185,6 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Remap, testing::Combine(
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP)),
WHOLE_SUBMAT));
#endif
#endif // HAVE_CUDA

View File

@ -174,6 +174,15 @@ GPU_TEST_P(Resize, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-2 : 1.0);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Resize, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
testing::Values(0.3, 0.5, 1.5, 2.0),
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Resize, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -181,6 +190,7 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Resize, testing::Combine(
testing::Values(0.3, 0.5, 1.5, 2.0),
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
WHOLE_SUBMAT));
#endif
/////////////////
@ -221,6 +231,15 @@ GPU_TEST_P(ResizeSameAsHost, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-2 : src.depth() == CV_8U ? 4.0 : 1.0);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeSameAsHost, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
testing::Values(0.3, 0.5),
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_AREA)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeSameAsHost, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -228,7 +247,17 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, ResizeSameAsHost, testing::Combine(
testing::Values(0.3, 0.5),
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_AREA)),
WHOLE_SUBMAT));
#endif
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_ImgProc2, ResizeSameAsHost, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
testing::Values(0.3, 0.5, 1.5, 2.0),
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_ImgProc2, ResizeSameAsHost, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -236,5 +265,6 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc2, ResizeSameAsHost, testing::Combine(
testing::Values(0.3, 0.5, 1.5, 2.0),
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
WHOLE_SUBMAT));
#endif
#endif // HAVE_CUDA
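The practical reading of the Resize instantiations is the same: a tiny build only exercises 8U/32F images with nearest or linear interpolation, so CV_16U inputs and INTER_CUBIC are no longer covered by the suite. A short caller-side sketch against the 2.4 gpu API that stays inside that set:

    #include <opencv2/core/core.hpp>
    #include <opencv2/imgproc/imgproc.hpp>
    #include <opencv2/gpu/gpu.hpp>

    // Downscale an 8UC4 image by half with bilinear interpolation --
    // one of the combinations the tiny-module tests above still cover.
    void downscaleHalf(const cv::Mat& hostSrc /* CV_8UC4 */, cv::Mat& hostDst)
    {
        cv::gpu::GpuMat src(hostSrc), dst;
        cv::gpu::resize(src, dst, cv::Size(), 0.5, 0.5, cv::INTER_LINEAR);
        dst.download(hostDst);
    }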


@ -83,11 +83,20 @@ GPU_TEST_P(Threshold, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Threshold, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_32FC1)),
ThreshOp::all(),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Threshold, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_16SC1), MatType(CV_32FC1)),
ThreshOp::all(),
WHOLE_SUBMAT));
#endif
#endif // HAVE_CUDA
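Threshold follows suit: the tiny instantiation keeps only CV_8UC1 and CV_32FC1 and drops CV_16SC1. A minimal sketch against the 2.4 gpu API (the threshold values are illustrative):

    #include <opencv2/imgproc/imgproc.hpp>
    #include <opencv2/gpu/gpu.hpp>

    // Binary-threshold a single-channel float image on the GPU;
    // CV_32FC1 stays in the tiny instantiation above, CV_16SC1 does not.
    void thresholdOnGpu(const cv::gpu::GpuMat& src /* CV_32FC1 */, cv::gpu::GpuMat& dst)
    {
        cv::gpu::threshold(src, dst, /*thresh=*/0.5, /*maxval=*/1.0, cv::THRESH_BINARY);
    }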


@ -222,6 +222,16 @@ GPU_TEST_P(WarpAffine, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-1 : 1.0);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffine, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
DIRECT_INVERSE,
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffine, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -230,6 +240,7 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffine, testing::Combine(
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP)),
WHOLE_SUBMAT));
#endif
///////////////////////////////////////////////////////////////////
// Test NPP
@ -271,10 +282,18 @@ GPU_TEST_P(WarpAffineNPP, Accuracy)
EXPECT_MAT_SIMILAR(dst_gold, dst, 2e-2);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffineNPP, testing::Combine(
ALL_DEVICES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
DIRECT_INVERSE,
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR))));
#else
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpAffineNPP, testing::Combine(
ALL_DEVICES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
DIRECT_INVERSE,
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))));
#endif
#endif // HAVE_CUDA
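For the affine warp, the tiny instantiations drop INTER_CUBIC and BORDER_WRAP. A sketch of a call that stays inside the tested set (2.4 gpu API; the 2x3 matrix values are arbitrary):

    #include <opencv2/core/core.hpp>
    #include <opencv2/imgproc/imgproc.hpp>
    #include <opencv2/gpu/gpu.hpp>

    // Slight rotation plus translation of an 8UC4 image, using linear
    // interpolation and a replicate border (no INTER_CUBIC, no BORDER_WRAP).
    void warpOnGpu(const cv::gpu::GpuMat& src /* CV_8UC4 */, cv::gpu::GpuMat& dst)
    {
        const cv::Mat M = (cv::Mat_<double>(2, 3) << 0.9, -0.1, 5.0,
                                                     0.1,  0.9, 3.0);
        cv::gpu::warpAffine(src, dst, M, src.size(),
                            cv::INTER_LINEAR, cv::BORDER_REPLICATE);
    }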


@ -225,6 +225,16 @@ GPU_TEST_P(WarpPerspective, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-1 : 1.0);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspective, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
DIRECT_INVERSE,
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR)),
testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT)),
WHOLE_SUBMAT));
#else
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspective, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
@ -233,6 +243,7 @@ INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspective, testing::Combine(
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)),
testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP)),
WHOLE_SUBMAT));
#endif
///////////////////////////////////////////////////////////////////
// Test NPP
@ -274,10 +285,18 @@ GPU_TEST_P(WarpPerspectiveNPP, Accuracy)
EXPECT_MAT_SIMILAR(dst_gold, dst, 2e-2);
}
#ifdef OPENCV_TINY_GPU_MODULE
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspectiveNPP, testing::Combine(
ALL_DEVICES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
DIRECT_INVERSE,
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR))));
#else
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, WarpPerspectiveNPP, testing::Combine(
ALL_DEVICES,
testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4), MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
DIRECT_INVERSE,
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))));
#endif
#endif // HAVE_CUDA


@ -50,8 +50,13 @@
namespace perf
{
#ifdef OPENCV_TINY_GPU_MODULE
#define ALL_BORDER_MODES testing::Values(BorderMode(cv::BORDER_REFLECT101), BorderMode(cv::BORDER_REPLICATE), BorderMode(cv::BORDER_CONSTANT), BorderMode(cv::BORDER_REFLECT))
#define ALL_INTERPOLATIONS testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_AREA))
#else
#define ALL_BORDER_MODES BorderMode::all()
#define ALL_INTERPOLATIONS Interpolation::all()
#endif
CV_ENUM(BorderMode, BORDER_REFLECT101, BORDER_REPLICATE, BORDER_CONSTANT, BORDER_REFLECT, BORDER_WRAP)
CV_ENUM(Interpolation, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA)
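With these macros the tiny/full decision is made once, here, and individual perf tests keep a single Combine(...) expression. A hypothetical perf test using them (the Sz_Border fixture and the test name are assumptions; DEF_PARAM_TEST, TEST_CYCLE and the sanity-check macros are the ones the existing gpu perf tests use):

    DEF_PARAM_TEST(Sz_Border, cv::Size, BorderMode);

    PERF_TEST_P(Sz_Border, ImgProc_MyCopyMakeBorder, Combine(
        GPU_TYPICAL_MAT_SIZES,
        ALL_BORDER_MODES))   // 4 border modes in a tiny build, all 5 (incl. BORDER_WRAP) otherwise
    {
        const cv::Size size = GET_PARAM(0);
        const int borderMode = GET_PARAM(1);

        cv::Mat src(size, CV_8UC1);
        declare.in(src, WARMUP_RNG);

        if (PERF_RUN_GPU())
        {
            const cv::gpu::GpuMat d_src(src);
            cv::gpu::GpuMat dst;

            TEST_CYCLE() cv::gpu::copyMakeBorder(d_src, dst, 5, 5, 5, 5, borderMode);

            GPU_SANITY_CHECK(dst);
        }
        else
        {
            cv::Mat dst;

            TEST_CYCLE() cv::copyMakeBorder(src, dst, 5, 5, 5, 5, borderMode);

            CPU_SANITY_CHECK(dst);
        }
    }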


@ -215,6 +215,12 @@ namespace cvtest
using perf::MatDepth;
#ifdef OPENCV_TINY_GPU_MODULE
#define ALL_DEPTH testing::Values(MatDepth(CV_8U), MatDepth(CV_32F))
#define DEPTH_PAIRS testing::Values(std::make_pair(MatDepth(CV_8U), MatDepth(CV_8U)), \
std::make_pair(MatDepth(CV_32F), MatDepth(CV_32F)))
#else
#define ALL_DEPTH testing::Values(MatDepth(CV_8U), MatDepth(CV_8S), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32S), MatDepth(CV_32F), MatDepth(CV_64F))
#define DEPTH_PAIRS testing::Values(std::make_pair(MatDepth(CV_8U), MatDepth(CV_8U)), \
@ -242,6 +248,7 @@ namespace cvtest
std::make_pair(MatDepth(CV_32F), MatDepth(CV_64F)), \
\
std::make_pair(MatDepth(CV_64F), MatDepth(CV_64F)))
#endif
// Type
@ -318,7 +325,11 @@ namespace cvtest
CV_ENUM(Interpolation, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, INTER_AREA)
CV_ENUM(BorderType, BORDER_REFLECT101, BORDER_REPLICATE, BORDER_CONSTANT, BORDER_REFLECT, BORDER_WRAP)
#ifdef OPENCV_TINY_GPU_MODULE
#define ALL_BORDER_TYPES testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT))
#else
#define ALL_BORDER_TYPES testing::Values(BorderType(cv::BORDER_REFLECT101), BorderType(cv::BORDER_REPLICATE), BorderType(cv::BORDER_CONSTANT), BorderType(cv::BORDER_REFLECT), BorderType(cv::BORDER_WRAP))
#endif
CV_FLAGS(WarpFlags, INTER_NEAREST, INTER_LINEAR, INTER_CUBIC, WARP_INVERSE_MAP)
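Accuracy tests can rely on these centralised macros the same way, so the test files themselves need no #ifdef at all. A hypothetical instantiation (the MyOp fixture name is an assumption; in a tiny build ALL_DEPTH expands to {CV_8U, CV_32F} and ALL_BORDER_TYPES omits BORDER_WRAP):

    INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MyOp, testing::Combine(
        ALL_DEVICES,
        DIFFERENT_SIZES,
        ALL_DEPTH,
        ALL_BORDER_TYPES,
        WHOLE_SUBMAT));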