Revert "Merge pull request #836 from jet47:gpu-modules"

This reverts commit fba72cb60d, reversing
changes made to 02131ffb62.
Andrey Kamaev
2013-04-18 15:03:50 +04:00
parent fba72cb60d
commit 416fb50594
472 changed files with 22945 additions and 29803 deletions


@@ -3,10 +3,101 @@ if(ANDROID OR IOS)
endif()
set(the_description "GPU-accelerated Computer Vision")
ocv_add_module(gpu opencv_imgproc opencv_calib3d opencv_objdetect opencv_video opencv_photo opencv_legacy)
ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4100 /wd4324 /wd4512 -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter)
ocv_module_include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/cuda")
ocv_define_module(gpu opencv_calib3d opencv_objdetect opencv_gpuarithm opencv_gpuwarping OPTIONAL opencv_gpulegacy)
file(GLOB lib_hdrs "include/opencv2/*.hpp" "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h")
file(GLOB lib_int_hdrs "src/*.hpp" "src/*.h")
file(GLOB lib_cuda_hdrs "src/cuda/*.hpp" "src/cuda/*.h")
file(GLOB lib_srcs "src/*.cpp")
file(GLOB lib_cuda "src/cuda/*.cu*")
source_group("Include" FILES ${lib_hdrs})
source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Src\\Cuda" FILES ${lib_cuda} ${lib_cuda_hdrs})
if(HAVE_CUDA)
file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp" "src/nvidia/*.h*")
file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
set(ncv_files ${ncv_srcs} ${ncv_cuda})
source_group("Src\\NVidia" FILES ${ncv_files})
ocv_include_directories("src/nvidia" "src/nvidia/core" "src/nvidia/NPP_staging" ${CUDA_INCLUDE_DIRS})
ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations -Wshadow -Wunused-parameter /wd4211 /wd4201 /wd4100 /wd4505 /wd4408)
if(MSVC)
if(NOT ENABLE_NOISY_WARNINGS)
foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
string(REPLACE "/W4" "/W3" ${var} "${${var}}")
endforeach()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler /wd4251)
endif()
endif()
ocv_cuda_compile(cuda_objs ${lib_cuda} ${ncv_cuda})
set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
if(WITH_NVCUVID)
set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvid_LIBRARY})
endif()
if(WIN32)
find_cuda_helper_libs(nvcuvenc)
set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvenc_LIBRARY})
endif()
if(WITH_FFMPEG)
set(cuda_link_libs ${cuda_link_libs} ${HIGHGUI_LIBRARIES})
endif()
else()
set(lib_cuda "")
set(cuda_objs "")
set(cuda_link_libs "")
set(ncv_files "")
endif()
ocv_set_module_sources(
HEADERS ${lib_hdrs}
SOURCES ${lib_int_hdrs} ${lib_cuda_hdrs} ${lib_srcs} ${lib_cuda} ${ncv_files} ${cuda_objs}
)
ocv_create_module(${cuda_link_libs})
if(HAVE_CUDA)
if(HAVE_CUFFT)
CUDA_ADD_CUFFT_TO_TARGET(${the_module})
endif()
if(HAVE_CUBLAS)
CUDA_ADD_CUBLAS_TO_TARGET(${the_module})
endif()
install(FILES src/nvidia/NPP_staging/NPP_staging.hpp src/nvidia/core/NCV.hpp
DESTINATION ${OPENCV_INCLUDE_INSTALL_PATH}/opencv2/${name}
COMPONENT main)
endif()
ocv_add_precompiled_headers(${the_module})
################################################################################################################
################################ GPU Module Tests #####################################################
################################################################################################################
file(GLOB test_srcs "test/*.cpp")
file(GLOB test_hdrs "test/*.hpp" "test/*.h")
set(nvidia "")
if(HAVE_CUDA)
file(GLOB nvidia "test/nvidia/*.cpp" "test/nvidia/*.hpp" "test/nvidia/*.h")
set(nvidia FILES "Src\\\\\\\\NVidia" ${nvidia}) # 8 ugly backslashes :'(
endif()
ocv_add_accuracy_tests(FILES "Include" ${test_hdrs}
FILES "Src" ${test_srcs}
${nvidia})
ocv_add_perf_tests()
if(HAVE_CUDA)
add_subdirectory(perf4au)


@@ -0,0 +1,10 @@
cmake_minimum_required(VERSION 2.8.3)
project(nv_perf_test)
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIR})
add_executable(${PROJECT_NAME} main.cpp)
target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS})

Two binary image files added (not shown; 140 KiB each).


@@ -0,0 +1,486 @@
#include <cstdio>
#define HAVE_CUDA 1
#include <opencv2/core.hpp>
#include <opencv2/gpu.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/video.hpp>
#include <opencv2/ts.hpp>
static void printOsInfo()
{
#if defined _WIN32
# if defined _WIN64
printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x64.\n[----------]\n"); fflush(stdout);
# else
printf("[----------]\n[ GPU INFO ] \tRun on OS Windows x32.\n[----------]\n"); fflush(stdout);
# endif
#elif defined linux
# if defined _LP64
printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x64.\n[----------]\n"); fflush(stdout);
# else
printf("[----------]\n[ GPU INFO ] \tRun on OS Linux x32.\n[----------]\n"); fflush(stdout);
# endif
#elif defined __APPLE__
# if defined _LP64
printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x64.\n[----------]\n"); fflush(stdout);
# else
printf("[----------]\n[ GPU INFO ] \tRun on OS Apple x32.\n[----------]\n"); fflush(stdout);
# endif
#endif
}
static void printCudaInfo()
{
const int deviceCount = cv::gpu::getCudaEnabledDeviceCount();
printf("[----------]\n"); fflush(stdout);
printf("[ GPU INFO ] \tCUDA device count:: %d.\n", deviceCount); fflush(stdout);
printf("[----------]\n"); fflush(stdout);
for (int i = 0; i < deviceCount; ++i)
{
cv::gpu::DeviceInfo info(i);
printf("[----------]\n"); fflush(stdout);
printf("[ DEVICE ] \t# %d %s.\n", i, info.name().c_str()); fflush(stdout);
printf("[ ] \tCompute capability: %d.%d\n", info.majorVersion(), info.minorVersion()); fflush(stdout);
printf("[ ] \tMulti Processor Count: %d\n", info.multiProcessorCount()); fflush(stdout);
printf("[ ] \tTotal memory: %d Mb\n", static_cast<int>(static_cast<int>(info.totalMemory() / 1024.0) / 1024.0)); fflush(stdout);
printf("[ ] \tFree memory: %d Mb\n", static_cast<int>(static_cast<int>(info.freeMemory() / 1024.0) / 1024.0)); fflush(stdout);
if (!info.isCompatible())
printf("[ GPU INFO ] \tThis device is NOT compatible with current GPU module build\n");
printf("[----------]\n"); fflush(stdout);
}
}
int main(int argc, char* argv[])
{
printOsInfo();
printCudaInfo();
perf::Regression::Init("nv_perf_test");
perf::TestBase::Init(argc, argv);
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#define DEF_PARAM_TEST(name, ...) typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name
#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name
//////////////////////////////////////////////////////////
// HoughLinesP
DEF_PARAM_TEST_1(Image, std::string);
GPU_PERF_TEST_P(Image, HoughLinesP, testing::Values(std::string("im1_1280x800.jpg")))
{
declare.time(30.0);
std::string fileName = GetParam();
const float rho = 1.f;
const float theta = 1.f;
const int threshold = 40;
const int minLineLenght = 20;
const int maxLineGap = 5;
cv::Mat image = cv::imread(fileName, cv::IMREAD_GRAYSCALE);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_image(image);
cv::gpu::GpuMat d_lines;
cv::gpu::HoughLinesBuf d_buf;
cv::gpu::HoughLinesP(d_image, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap);
TEST_CYCLE()
{
cv::gpu::HoughLinesP(d_image, d_lines, d_buf, rho, theta, minLineLenght, maxLineGap);
}
}
else
{
cv::Mat mask;
cv::Canny(image, mask, 50, 100);
std::vector<cv::Vec4i> lines;
cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap);
TEST_CYCLE()
{
cv::HoughLinesP(mask, lines, rho, theta, threshold, minLineLenght, maxLineGap);
}
}
SANITY_CHECK(0);
}
//////////////////////////////////////////////////////////
// GoodFeaturesToTrack
DEF_PARAM_TEST(Image_Depth, std::string, perf::MatDepth);
GPU_PERF_TEST_P(Image_Depth, GoodFeaturesToTrack,
testing::Combine(
testing::Values(std::string("im1_1280x800.jpg")),
testing::Values(CV_8U, CV_16U)
))
{
declare.time(60);
const std::string fileName = std::tr1::get<0>(GetParam());
const int depth = std::tr1::get<1>(GetParam());
const int maxCorners = 5000;
const double qualityLevel = 0.05;
const int minDistance = 5;
const int blockSize = 3;
const bool useHarrisDetector = true;
const double k = 0.05;
cv::Mat src = cv::imread(fileName, cv::IMREAD_GRAYSCALE);
if (src.empty())
FAIL() << "Unable to load source image [" << fileName << "]";
if (depth != CV_8U)
src.convertTo(src, depth);
cv::Mat mask(src.size(), CV_8UC1, cv::Scalar::all(1));
mask(cv::Rect(0, 0, 100, 100)).setTo(cv::Scalar::all(0));
if (PERF_RUN_GPU())
{
cv::gpu::GoodFeaturesToTrackDetector_GPU d_detector(maxCorners, qualityLevel, minDistance, blockSize, useHarrisDetector, k);
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_mask(mask);
cv::gpu::GpuMat d_pts;
d_detector(d_src, d_pts, d_mask);
TEST_CYCLE()
{
d_detector(d_src, d_pts, d_mask);
}
}
else
{
if (depth != CV_8U)
FAIL() << "Unsupported depth";
cv::Mat pts;
cv::goodFeaturesToTrack(src, pts, maxCorners, qualityLevel, minDistance, mask, blockSize, useHarrisDetector, k);
TEST_CYCLE()
{
cv::goodFeaturesToTrack(src, pts, maxCorners, qualityLevel, minDistance, mask, blockSize, useHarrisDetector, k);
}
}
SANITY_CHECK(0);
}
//////////////////////////////////////////////////////////
// OpticalFlowPyrLKSparse
typedef std::pair<std::string, std::string> string_pair;
DEF_PARAM_TEST(ImagePair_Depth_GraySource, string_pair, perf::MatDepth, bool);
GPU_PERF_TEST_P(ImagePair_Depth_GraySource, OpticalFlowPyrLKSparse,
testing::Combine(
testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")),
testing::Values(CV_8U, CV_16U),
testing::Bool()
))
{
declare.time(60);
const string_pair fileNames = std::tr1::get<0>(GetParam());
const int depth = std::tr1::get<1>(GetParam());
const bool graySource = std::tr1::get<2>(GetParam());
// PyrLK params
const cv::Size winSize(15, 15);
const int maxLevel = 5;
const cv::TermCriteria criteria(cv::TermCriteria::COUNT + cv::TermCriteria::EPS, 30, 0.01);
// GoodFeaturesToTrack params
const int maxCorners = 5000;
const double qualityLevel = 0.05;
const int minDistance = 5;
const int blockSize = 3;
const bool useHarrisDetector = true;
const double k = 0.05;
cv::Mat src1 = cv::imread(fileNames.first, graySource ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
if (src1.empty())
FAIL() << "Unable to load source image [" << fileNames.first << "]";
cv::Mat src2 = cv::imread(fileNames.second, graySource ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
if (src2.empty())
FAIL() << "Unable to load source image [" << fileNames.second << "]";
cv::Mat gray_src;
if (graySource)
gray_src = src1;
else
cv::cvtColor(src1, gray_src, cv::COLOR_BGR2GRAY);
cv::Mat pts;
cv::goodFeaturesToTrack(gray_src, pts, maxCorners, qualityLevel, minDistance, cv::noArray(), blockSize, useHarrisDetector, k);
if (depth != CV_8U)
{
src1.convertTo(src1, depth);
src2.convertTo(src2, depth);
}
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src1(src1);
cv::gpu::GpuMat d_src2(src2);
cv::gpu::GpuMat d_pts(pts.reshape(2, 1));
cv::gpu::GpuMat d_nextPts;
cv::gpu::GpuMat d_status;
cv::gpu::PyrLKOpticalFlow d_pyrLK;
d_pyrLK.winSize = winSize;
d_pyrLK.maxLevel = maxLevel;
d_pyrLK.iters = criteria.maxCount;
d_pyrLK.useInitialFlow = false;
d_pyrLK.sparse(d_src1, d_src2, d_pts, d_nextPts, d_status);
TEST_CYCLE()
{
d_pyrLK.sparse(d_src1, d_src2, d_pts, d_nextPts, d_status);
}
}
else
{
if (depth != CV_8U)
FAIL() << "Unsupported depth";
cv::Mat nextPts;
cv::Mat status;
cv::calcOpticalFlowPyrLK(src1, src2, pts, nextPts, status, cv::noArray(), winSize, maxLevel, criteria);
TEST_CYCLE()
{
cv::calcOpticalFlowPyrLK(src1, src2, pts, nextPts, status, cv::noArray(), winSize, maxLevel, criteria);
}
}
SANITY_CHECK(0);
}
//////////////////////////////////////////////////////////
// OpticalFlowFarneback
DEF_PARAM_TEST(ImagePair_Depth, string_pair, perf::MatDepth);
GPU_PERF_TEST_P(ImagePair_Depth, OpticalFlowFarneback,
testing::Combine(
testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")),
testing::Values(CV_8U, CV_16U)
))
{
declare.time(500);
const string_pair fileNames = std::tr1::get<0>(GetParam());
const int depth = std::tr1::get<1>(GetParam());
const double pyrScale = 0.5;
const int numLevels = 6;
const int winSize = 7;
const int numIters = 15;
const int polyN = 7;
const double polySigma = 1.5;
const int flags = cv::OPTFLOW_USE_INITIAL_FLOW;
cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE);
if (src1.empty())
FAIL() << "Unable to load source image [" << fileNames.first << "]";
cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE);
if (src2.empty())
FAIL() << "Unable to load source image [" << fileNames.second << "]";
if (depth != CV_8U)
{
src1.convertTo(src1, depth);
src2.convertTo(src2, depth);
}
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src1(src1);
cv::gpu::GpuMat d_src2(src2);
cv::gpu::GpuMat d_u(src1.size(), CV_32FC1, cv::Scalar::all(0));
cv::gpu::GpuMat d_v(src1.size(), CV_32FC1, cv::Scalar::all(0));
cv::gpu::FarnebackOpticalFlow d_farneback;
d_farneback.pyrScale = pyrScale;
d_farneback.numLevels = numLevels;
d_farneback.winSize = winSize;
d_farneback.numIters = numIters;
d_farneback.polyN = polyN;
d_farneback.polySigma = polySigma;
d_farneback.flags = flags;
d_farneback(d_src1, d_src2, d_u, d_v);
TEST_CYCLE_N(10)
{
d_farneback(d_src1, d_src2, d_u, d_v);
}
}
else
{
if (depth != CV_8U)
FAIL() << "Unsupported depth";
cv::Mat flow(src1.size(), CV_32FC2, cv::Scalar::all(0));
cv::calcOpticalFlowFarneback(src1, src2, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags);
TEST_CYCLE_N(10)
{
cv::calcOpticalFlowFarneback(src1, src2, flow, pyrScale, numLevels, winSize, numIters, polyN, polySigma, flags);
}
}
SANITY_CHECK(0);
}
//////////////////////////////////////////////////////////
// OpticalFlowBM
void calcOpticalFlowBM(const cv::Mat& prev, const cv::Mat& curr,
cv::Size bSize, cv::Size shiftSize, cv::Size maxRange, int usePrevious,
cv::Mat& velx, cv::Mat& vely)
{
cv::Size sz((curr.cols - bSize.width + shiftSize.width)/shiftSize.width, (curr.rows - bSize.height + shiftSize.height)/shiftSize.height);
velx.create(sz, CV_32FC1);
vely.create(sz, CV_32FC1);
CvMat cvprev = prev;
CvMat cvcurr = curr;
CvMat cvvelx = velx;
CvMat cvvely = vely;
cvCalcOpticalFlowBM(&cvprev, &cvcurr, bSize, shiftSize, maxRange, usePrevious, &cvvelx, &cvvely);
}
DEF_PARAM_TEST(ImagePair_BlockSize_ShiftSize_MaxRange, string_pair, cv::Size, cv::Size, cv::Size);
GPU_PERF_TEST_P(ImagePair_BlockSize_ShiftSize_MaxRange, OpticalFlowBM,
testing::Combine(
testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")),
testing::Values(cv::Size(16, 16)),
testing::Values(cv::Size(2, 2)),
testing::Values(cv::Size(16, 16))
))
{
declare.time(3000);
const string_pair fileNames = std::tr1::get<0>(GetParam());
const cv::Size block_size = std::tr1::get<1>(GetParam());
const cv::Size shift_size = std::tr1::get<2>(GetParam());
const cv::Size max_range = std::tr1::get<3>(GetParam());
cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE);
if (src1.empty())
FAIL() << "Unable to load source image [" << fileNames.first << "]";
cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE);
if (src2.empty())
FAIL() << "Unable to load source image [" << fileNames.second << "]";
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src1(src1);
cv::gpu::GpuMat d_src2(src2);
cv::gpu::GpuMat d_velx, d_vely, buf;
cv::gpu::calcOpticalFlowBM(d_src1, d_src2, block_size, shift_size, max_range, false, d_velx, d_vely, buf);
TEST_CYCLE_N(10)
{
cv::gpu::calcOpticalFlowBM(d_src1, d_src2, block_size, shift_size, max_range, false, d_velx, d_vely, buf);
}
}
else
{
cv::Mat velx, vely;
calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely);
TEST_CYCLE_N(10)
{
calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely);
}
}
SANITY_CHECK(0);
}
GPU_PERF_TEST_P(ImagePair_BlockSize_ShiftSize_MaxRange, FastOpticalFlowBM,
testing::Combine(
testing::Values(string_pair("im1_1280x800.jpg", "im2_1280x800.jpg")),
testing::Values(cv::Size(16, 16)),
testing::Values(cv::Size(1, 1)),
testing::Values(cv::Size(16, 16))
))
{
declare.time(3000);
const string_pair fileNames = std::tr1::get<0>(GetParam());
const cv::Size block_size = std::tr1::get<1>(GetParam());
const cv::Size shift_size = std::tr1::get<2>(GetParam());
const cv::Size max_range = std::tr1::get<3>(GetParam());
cv::Mat src1 = cv::imread(fileNames.first, cv::IMREAD_GRAYSCALE);
if (src1.empty())
FAIL() << "Unable to load source image [" << fileNames.first << "]";
cv::Mat src2 = cv::imread(fileNames.second, cv::IMREAD_GRAYSCALE);
if (src2.empty())
FAIL() << "Unable to load source image [" << fileNames.second << "]";
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src1(src1);
cv::gpu::GpuMat d_src2(src2);
cv::gpu::GpuMat d_velx, d_vely;
cv::gpu::FastOpticalFlowBM fastBM;
fastBM(d_src1, d_src2, d_velx, d_vely, max_range.width, block_size.width);
TEST_CYCLE_N(10)
{
fastBM(d_src1, d_src2, d_velx, d_vely, max_range.width, block_size.width);
}
}
else
{
cv::Mat velx, vely;
calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely);
TEST_CYCLE_N(10)
{
calcOpticalFlowBM(src1, src2, block_size, shift_size, max_range, false, velx, vely);
}
}
SANITY_CHECK(0);
}


@@ -1,36 +0,0 @@
Camera Calibration and 3D Reconstruction
========================================
.. highlight:: cpp
gpu::solvePnPRansac
-------------------
Finds the object pose from 3D-2D point correspondences.
.. ocv:function:: void gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat, const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false, int num_iters=100, float max_dist=8.0, int min_inlier_count=100, vector<int>* inliers=NULL)
:param object: Single-row matrix of object points.
:param image: Single-row matrix of image points.
:param camera_mat: 3x3 matrix of intrinsic camera parameters.
:param dist_coef: Distortion coefficients. See :ocv:func:`undistortPoints` for details.
:param rvec: Output 3D rotation vector.
:param tvec: Output 3D translation vector.
:param use_extrinsic_guess: Flag to indicate that the function must use ``rvec`` and ``tvec`` as an initial transformation guess. This is not supported yet.
:param num_iters: Maximum number of RANSAC iterations.
:param max_dist: Euclidean distance threshold to detect whether point is inlier or not.
:param min_inlier_count: Flag to indicate that the function must stop once the number of inliers is greater than or equal to this value. This is not supported yet.
:param inliers: Output vector of inlier indices.
.. seealso:: :ocv:func:`solvePnPRansac`


@@ -0,0 +1,499 @@
Camera Calibration and 3D Reconstruction
========================================
.. highlight:: cpp
gpu::StereoBM_GPU
-----------------
.. ocv:class:: gpu::StereoBM_GPU
Class computing stereo correspondence (disparity map) using the block matching algorithm. ::
class StereoBM_GPU
{
public:
enum { BASIC_PRESET = 0, PREFILTER_XSOBEL = 1 };
enum { DEFAULT_NDISP = 64, DEFAULT_WINSZ = 19 };
StereoBM_GPU();
StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP,
int winSize = DEFAULT_WINSZ);
void operator() (const GpuMat& left, const GpuMat& right,
GpuMat& disparity, Stream& stream = Stream::Null());
static bool checkIfGpuCallReasonable();
int preset;
int ndisp;
int winSize;
float avergeTexThreshold;
...
};
The class also performs pre- and post-filtering steps: Sobel pre-filtering (if the ``PREFILTER_XSOBEL`` flag is set) and low-textureness filtering (if ``avergeTexThreshold > 0``). If ``avergeTexThreshold = 0``, low-textureness filtering is disabled. Otherwise, the disparity is set to 0 at each point ``(x, y)`` where, for the left image,
.. math::
\sum HorizontalGradientsInWindow(x, y, winSize) < (winSize \cdot winSize) \cdot avergeTexThreshold
This means that the input left image is low textured.
gpu::StereoBM_GPU::StereoBM_GPU
-----------------------------------
Enables :ocv:class:`gpu::StereoBM_GPU` constructors.
.. ocv:function:: gpu::StereoBM_GPU::StereoBM_GPU()
.. ocv:function:: gpu::StereoBM_GPU::StereoBM_GPU(int preset, int ndisparities = DEFAULT_NDISP, int winSize = DEFAULT_WINSZ)
:param preset: Parameter presetting:
* **BASIC_PRESET** Basic mode without pre-processing.
* **PREFILTER_XSOBEL** Sobel pre-filtering mode.
:param ndisparities: Number of disparities. It must be a multiple of 8 and less than or equal to 256.
:param winSize: Block size.
gpu::StereoBM_GPU::operator ()
----------------------------------
Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair.
.. ocv:function:: void gpu::StereoBM_GPU::operator ()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null())
:param left: Left image. Only ``CV_8UC1`` type is supported.
:param right: Right image with the same size and the same type as the left one.
:param disparity: Output disparity map. It is a ``CV_8UC1`` image with the same size as the input images.
:param stream: Stream for the asynchronous version.
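A minimal usage sketch (the file names are illustrative and the stereo pair is assumed to be rectified already): ::

    cv::Mat left  = cv::imread("left.png",  cv::IMREAD_GRAYSCALE);   // CV_8UC1
    cv::Mat right = cv::imread("right.png", cv::IMREAD_GRAYSCALE);

    // Sobel pre-filtering, 128 disparities (a multiple of 8), 19x19 block
    cv::gpu::StereoBM_GPU bm(cv::gpu::StereoBM_GPU::PREFILTER_XSOBEL, 128, 19);

    cv::gpu::GpuMat d_left(left), d_right(right), d_disp;
    bm(d_left, d_right, d_disp);          // d_disp is CV_8UC1, same size as the inputs

    cv::Mat disp;
    d_disp.download(disp);                // copy the result back to the host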
gpu::StereoBM_GPU::checkIfGpuCallReasonable
-----------------------------------------------
Uses a heuristic method to estimate whether the current GPU is faster than the CPU in this algorithm. It queries the currently active device.
.. ocv:function:: bool gpu::StereoBM_GPU::checkIfGpuCallReasonable()
gpu::StereoBeliefPropagation
----------------------------
.. ocv:class:: gpu::StereoBeliefPropagation
Class computing stereo correspondence using the belief propagation algorithm. ::
class StereoBeliefPropagation
{
public:
enum { DEFAULT_NDISP = 64 };
enum { DEFAULT_ITERS = 5 };
enum { DEFAULT_LEVELS = 5 };
static void estimateRecommendedParams(int width, int height,
int& ndisp, int& iters, int& levels);
explicit StereoBeliefPropagation(int ndisp = DEFAULT_NDISP,
int iters = DEFAULT_ITERS,
int levels = DEFAULT_LEVELS,
int msg_type = CV_32F);
StereoBeliefPropagation(int ndisp, int iters, int levels,
float max_data_term, float data_weight,
float max_disc_term, float disc_single_jump,
int msg_type = CV_32F);
void operator()(const GpuMat& left, const GpuMat& right,
GpuMat& disparity, Stream& stream = Stream::Null());
void operator()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null());
int ndisp;
int iters;
int levels;
float max_data_term;
float data_weight;
float max_disc_term;
float disc_single_jump;
int msg_type;
...
};
The class implements the algorithm described in [Felzenszwalb2006]_. It can compute its own data cost (using a truncated linear model) or use a user-provided data cost.
.. note::
``StereoBeliefPropagation`` requires a lot of memory for message storage:
.. math::
width\_step \cdot height \cdot ndisp \cdot 4 \cdot (1 + 0.25)
and for data cost storage:
.. math::
width\_step \cdot height \cdot ndisp \cdot (1 + 0.25 + 0.0625 + \dotsm + \frac{1}{4^{levels}})
``width_step`` is the number of bytes in a line including padding.
gpu::StereoBeliefPropagation::StereoBeliefPropagation
---------------------------------------------------------
Enables the :ocv:class:`gpu::StereoBeliefPropagation` constructors.
.. ocv:function:: gpu::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp = DEFAULT_NDISP, int iters = DEFAULT_ITERS, int levels = DEFAULT_LEVELS, int msg_type = CV_32F)
.. ocv:function:: gpu::StereoBeliefPropagation::StereoBeliefPropagation(int ndisp, int iters, int levels, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int msg_type = CV_32F)
:param ndisp: Number of disparities.
:param iters: Number of BP iterations on each level.
:param levels: Number of levels.
:param max_data_term: Threshold for data cost truncation.
:param data_weight: Data weight.
:param max_disc_term: Threshold for discontinuity truncation.
:param disc_single_jump: Discontinuity single jump.
:param msg_type: Type for messages. ``CV_16SC1`` and ``CV_32FC1`` types are supported.
``StereoBeliefPropagation`` uses a truncated linear model for the data cost and discontinuity terms:
.. math::
DataCost = data \_ weight \cdot \min ( \lvert Img_{Left}(x,y)-Img_{Right}(x-d,y) \rvert , max \_ data \_ term)
.. math::
DiscTerm = \min (disc \_ single \_ jump \cdot \lvert f_1-f_2 \rvert , max \_ disc \_ term)
For more details, see [Felzenszwalb2006]_.
By default, :ocv:class:`gpu::StereoBeliefPropagation` uses floating-point arithmetic and the ``CV_32FC1`` type for messages. But it can also use fixed-point arithmetic and the ``CV_16SC1`` message type for better performance. To avoid an overflow in this case, the parameters must satisfy the following requirement:
.. math::
10 \cdot 2^{levels-1} \cdot max \_ data \_ term < SHRT \_ MAX
gpu::StereoBeliefPropagation::estimateRecommendedParams
-----------------------------------------------------------
Uses a heuristic method to compute the recommended parameters ( ``ndisp``, ``iters`` and ``levels`` ) for the specified image size ( ``width`` and ``height`` ).
.. ocv:function:: void gpu::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels)
gpu::StereoBeliefPropagation::operator ()
---------------------------------------------
Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair or data cost.
.. ocv:function:: void gpu::StereoBeliefPropagation::operator ()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::StereoBeliefPropagation::operator ()(const GpuMat& data, GpuMat& disparity, Stream& stream = Stream::Null())
:param left: Left image. ``CV_8UC1`` , ``CV_8UC3`` and ``CV_8UC4`` types are supported.
:param right: Right image with the same size and the same type as the left one.
:param data: User-specified data cost, a matrix of ``msg_type`` type and ``Size(<image columns>*ndisp, <image rows>)`` size.
:param disparity: Output disparity map. If ``disparity`` is empty, the output type is ``CV_16SC1`` . Otherwise, the type is retained.
:param stream: Stream for the asynchronous version.
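A short sketch of the single-pair variant, assuming ``d_left`` and ``d_right`` are ``CV_8UC1`` ``GpuMat`` images that have already been uploaded: ::

    // 64 disparities, 5 iterations per level, 5 levels, CV_32F messages (the defaults)
    cv::gpu::StereoBeliefPropagation bp(64, 5, 5, CV_32F);

    cv::gpu::GpuMat d_disp;
    bp(d_left, d_right, d_disp);          // CV_16SC1 output, since d_disp is empty

    cv::Mat disp;
    d_disp.download(disp);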
gpu::StereoConstantSpaceBP
--------------------------
.. ocv:class:: gpu::StereoConstantSpaceBP
Class computing stereo correspondence using the constant space belief propagation algorithm. ::
class StereoConstantSpaceBP
{
public:
enum { DEFAULT_NDISP = 128 };
enum { DEFAULT_ITERS = 8 };
enum { DEFAULT_LEVELS = 4 };
enum { DEFAULT_NR_PLANE = 4 };
static void estimateRecommendedParams(int width, int height,
int& ndisp, int& iters, int& levels, int& nr_plane);
explicit StereoConstantSpaceBP(int ndisp = DEFAULT_NDISP,
int iters = DEFAULT_ITERS,
int levels = DEFAULT_LEVELS,
int nr_plane = DEFAULT_NR_PLANE,
int msg_type = CV_32F);
StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane,
float max_data_term, float data_weight,
float max_disc_term, float disc_single_jump,
int min_disp_th = 0,
int msg_type = CV_32F);
void operator()(const GpuMat& left, const GpuMat& right,
GpuMat& disparity, Stream& stream = Stream::Null());
int ndisp;
int iters;
int levels;
int nr_plane;
float max_data_term;
float data_weight;
float max_disc_term;
float disc_single_jump;
int min_disp_th;
int msg_type;
bool use_local_init_data_cost;
...
};
The class implements the algorithm described in [Yang2010]_. ``StereoConstantSpaceBP`` supports both local minimum and global minimum data cost initialization algorithms. For more details, see the paper mentioned above. By default, a local algorithm is used. To enable a global algorithm, set ``use_local_init_data_cost`` to ``false`` .
gpu::StereoConstantSpaceBP::StereoConstantSpaceBP
-----------------------------------------------------
Enables the :ocv:class:`gpu::StereoConstantSpaceBP` constructors.
.. ocv:function:: gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp = DEFAULT_NDISP, int iters = DEFAULT_ITERS, int levels = DEFAULT_LEVELS, int nr_plane = DEFAULT_NR_PLANE, int msg_type = CV_32F)
.. ocv:function:: gpu::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp, int iters, int levels, int nr_plane, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th = 0, int msg_type = CV_32F)
:param ndisp: Number of disparities.
:param iters: Number of BP iterations on each level.
:param levels: Number of levels.
:param nr_plane: Number of disparity levels on the first level.
:param max_data_term: Truncation of data cost.
:param data_weight: Data weight.
:param max_disc_term: Truncation of discontinuity.
:param disc_single_jump: Discontinuity single jump.
:param min_disp_th: Minimal disparity threshold.
:param msg_type: Type for messages. ``CV_16SC1`` and ``CV_32FC1`` types are supported.
``StereoConstantSpaceBP`` uses a truncated linear model for the data cost and discontinuity terms:
.. math::
DataCost = data \_ weight \cdot \min ( \lvert I_2-I_1 \rvert , max \_ data \_ term)
.. math::
DiscTerm = \min (disc \_ single \_ jump \cdot \lvert f_1-f_2 \rvert , max \_ disc \_ term)
For more details, see [Yang2010]_.
By default, ``StereoConstantSpaceBP`` uses floating-point arithmetic and the ``CV_32FC1`` type for messages. But it can also use fixed-point arithmetic and the ``CV_16SC1`` message type for better performance. To avoid an overflow in this case, the parameters must satisfy the following requirement:
.. math::
10 \cdot 2^{levels-1} \cdot max \_ data \_ term < SHRT \_ MAX
gpu::StereoConstantSpaceBP::estimateRecommendedParams
---------------------------------------------------------
Uses a heuristic method to compute the recommended parameters (``ndisp``, ``iters``, ``levels``, and ``nr_plane``) for the specified image size (``width`` and ``height``).
.. ocv:function:: void gpu::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels, int& nr_plane)
gpu::StereoConstantSpaceBP::operator ()
-------------------------------------------
Enables the stereo correspondence operator that finds the disparity for the specified rectified stereo pair.
.. ocv:function:: void gpu::StereoConstantSpaceBP::operator ()(const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream = Stream::Null())
:param left: Left image. ``CV_8UC1`` , ``CV_8UC3`` and ``CV_8UC4`` types are supported.
:param right: Right image with the same size and the same type as the left one.
:param disparity: Output disparity map. If ``disparity`` is empty, the output type is ``CV_16SC1`` . Otherwise, the output type is ``disparity.type()`` .
:param stream: Stream for the asynchronous version.
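A sketch that lets the class pick its own parameters for the image size, again assuming uploaded ``CV_8UC1`` inputs ``d_left`` and ``d_right``: ::

    int ndisp, iters, levels, nr_plane;
    cv::gpu::StereoConstantSpaceBP::estimateRecommendedParams(d_left.cols, d_left.rows,
                                                              ndisp, iters, levels, nr_plane);

    cv::gpu::StereoConstantSpaceBP csbp(ndisp, iters, levels, nr_plane, CV_32F);

    cv::gpu::GpuMat d_disp;
    csbp(d_left, d_right, d_disp);        // CV_16SC1 output, since d_disp is empty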
gpu::DisparityBilateralFilter
-----------------------------
.. ocv:class:: gpu::DisparityBilateralFilter
Class refining a disparity map using joint bilateral filtering. ::
class CV_EXPORTS DisparityBilateralFilter
{
public:
enum { DEFAULT_NDISP = 64 };
enum { DEFAULT_RADIUS = 3 };
enum { DEFAULT_ITERS = 1 };
explicit DisparityBilateralFilter(int ndisp = DEFAULT_NDISP,
int radius = DEFAULT_RADIUS, int iters = DEFAULT_ITERS);
DisparityBilateralFilter(int ndisp, int radius, int iters,
float edge_threshold, float max_disc_threshold,
float sigma_range);
void operator()(const GpuMat& disparity, const GpuMat& image,
GpuMat& dst, Stream& stream = Stream::Null());
...
};
The class implements the algorithm described in [Yang2010]_.
gpu::DisparityBilateralFilter::DisparityBilateralFilter
-----------------------------------------------------------
Enables the :ocv:class:`gpu::DisparityBilateralFilter` constructors.
.. ocv:function:: gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp = DEFAULT_NDISP, int radius = DEFAULT_RADIUS, int iters = DEFAULT_ITERS)
.. ocv:function:: gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold, float sigma_range)
:param ndisp: Number of disparities.
:param radius: Filter radius.
:param iters: Number of iterations.
:param edge_threshold: Threshold for edges.
:param max_disc_threshold: Constant to reject outliers.
:param sigma_range: Filter range.
gpu::DisparityBilateralFilter::operator ()
----------------------------------------------
Refines a disparity map using joint bilateral filtering.
.. ocv:function:: void gpu::DisparityBilateralFilter::operator ()(const GpuMat& disparity, const GpuMat& image, GpuMat& dst, Stream& stream = Stream::Null())
:param disparity: Input disparity map. ``CV_8UC1`` and ``CV_16SC1`` types are supported.
:param image: Input image. ``CV_8UC1`` and ``CV_8UC3`` types are supported.
:param dst: Destination disparity map. It has the same size and type as ``disparity`` .
:param stream: Stream for the asynchronous version.
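A sketch that refines a previously computed disparity map ``d_disp``, using the left view ``d_left`` as the guidance image (both assumed to be on the GPU already): ::

    // 64 disparities, radius 3, 1 iteration (the defaults)
    cv::gpu::DisparityBilateralFilter dbf(64, 3, 1);

    cv::gpu::GpuMat d_refined;
    dbf(d_disp, d_left, d_refined);       // same size and type as d_disp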
gpu::drawColorDisp
----------------------
Colors a disparity image.
.. ocv:function:: void gpu::drawColorDisp(const GpuMat& src_disp, GpuMat& dst_disp, int ndisp, Stream& stream = Stream::Null())
:param src_disp: Source disparity image. ``CV_8UC1`` and ``CV_16SC1`` types are supported.
:param dst_disp: Output disparity image. It has the same size as ``src_disp`` . The type is ``CV_8UC4`` in ``BGRA`` format (alpha = 255).
:param ndisp: Number of disparities.
:param stream: Stream for the asynchronous version.
This function draws a colored disparity map by converting disparity values from ``[0..ndisp)`` interval first to ``HSV`` color space (where different disparity values correspond to different hues) and then converting the pixels to ``RGB`` for visualization.
gpu::reprojectImageTo3D
---------------------------
Reprojects a disparity image to 3D space.
.. ocv:function:: void gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, int dst_cn = 4, Stream& stream = Stream::Null())
:param disp: Input disparity image. ``CV_8U`` and ``CV_16S`` types are supported.
:param xyzw: Output 3- or 4-channel floating-point image of the same size as ``disp`` . Each element of ``xyzw(x,y)`` contains 3D coordinates ``(x,y,z)`` or ``(x,y,z,1)`` of the point ``(x,y)`` , computed from the disparity map.
:param Q: :math:`4 \times 4` perspective transformation matrix that can be obtained via :ocv:func:`stereoRectify` .
:param dst_cn: The number of channels for output image. Can be 3 or 4.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`reprojectImageTo3D`
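A sketch combining the two functions above: color a disparity map ``d_disp`` for display and reproject it to 3D, where ``Q`` is assumed to be the :math:`4 \times 4` reprojection matrix from :ocv:func:`stereoRectify`: ::

    cv::gpu::GpuMat d_color, d_xyzw;
    cv::gpu::drawColorDisp(d_disp, d_color, 64);        // CV_8UC4 BGRA visualization
    cv::gpu::reprojectImageTo3D(d_disp, d_xyzw, Q, 4);  // 4-channel floating-point points

    cv::Mat color, xyzw;
    d_color.download(color);
    d_xyzw.download(xyzw);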
gpu::solvePnPRansac
-------------------
Finds the object pose from 3D-2D point correspondences.
.. ocv:function:: void gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat, const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess=false, int num_iters=100, float max_dist=8.0, int min_inlier_count=100, vector<int>* inliers=NULL)
:param object: Single-row matrix of object points.
:param image: Single-row matrix of image points.
:param camera_mat: 3x3 matrix of intrinsic camera parameters.
:param dist_coef: Distortion coefficients. See :ocv:func:`undistortPoints` for details.
:param rvec: Output 3D rotation vector.
:param tvec: Output 3D translation vector.
:param use_extrinsic_guess: Flag to indicate that the function must use ``rvec`` and ``tvec`` as an initial transformation guess. This is not supported yet.
:param num_iters: Maximum number of RANSAC iterations.
:param max_dist: Euclidean distance threshold to detect whether point is inlier or not.
:param min_inlier_count: Flag to indicate that the function must stop once the number of inliers is greater than or equal to this value. This is not supported yet.
:param inliers: Output vector of inlier indices.
.. seealso:: :ocv:func:`solvePnPRansac`
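A minimal call sketch; ``object`` (a single-row ``CV_32FC3`` matrix of object points), ``image_pts`` (a single-row ``CV_32FC2`` matrix of image points), ``camera_mat``, and ``dist_coef`` are assumed to be prepared host matrices: ::

    cv::Mat rvec, tvec;
    std::vector<int> inliers;
    cv::gpu::solvePnPRansac(object, image_pts, camera_mat, dist_coef,
                            rvec, tvec, false /*use_extrinsic_guess*/,
                            200 /*num_iters*/, 8.0f /*max_dist*/,
                            100 /*min_inlier_count*/, &inliers);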
.. [Felzenszwalb2006] Pedro F. Felzenszwalb and Daniel P. Huttenlocher. *Efficient belief propagation for early vision*. International Journal of Computer Vision, 70(1), October 2006.
.. [Yang2010] Q. Yang, L. Wang, and N. Ahuja. *A constant-space belief propagation algorithm for stereo matching*. In CVPR, 2010.


@@ -0,0 +1,539 @@
Feature Detection and Description
=================================
.. highlight:: cpp
gpu::FAST_GPU
-------------
.. ocv:class:: gpu::FAST_GPU
Class used for corner detection using the FAST algorithm. ::
class FAST_GPU
{
public:
enum
{
LOCATION_ROW = 0,
RESPONSE_ROW,
ROWS_COUNT
};
// all features have same size
static const int FEATURE_SIZE = 7;
explicit FAST_GPU(int threshold, bool nonmaxSupression = true,
double keypointsRatio = 0.05);
void operator ()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints);
void operator ()(const GpuMat& image, const GpuMat& mask,
std::vector<KeyPoint>& keypoints);
void downloadKeypoints(const GpuMat& d_keypoints,
std::vector<KeyPoint>& keypoints);
void convertKeypoints(const Mat& h_keypoints,
std::vector<KeyPoint>& keypoints);
void release();
bool nonmaxSupression;
int threshold;
double keypointsRatio;
int calcKeyPointsLocation(const GpuMat& image, const GpuMat& mask);
int getKeyPoints(GpuMat& keypoints);
};
The class ``FAST_GPU`` implements the FAST corner detection algorithm.
.. seealso:: :ocv:func:`FAST`
gpu::FAST_GPU::FAST_GPU
-------------------------------------
Constructor.
.. ocv:function:: gpu::FAST_GPU::FAST_GPU(int threshold, bool nonmaxSupression = true, double keypointsRatio = 0.05)
:param threshold: Threshold on difference between intensity of the central pixel and pixels on a circle around this pixel.
:param nonmaxSupression: If it is true, non-maximum suppression is applied to detected corners (keypoints).
:param keypointsRatio: Inner buffer size for the keypoint store, determined as ``keypointsRatio * image_width * image_height``.
gpu::FAST_GPU::operator ()
-------------------------------------
Finds the keypoints using FAST detector.
.. ocv:function:: void gpu::FAST_GPU::operator ()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints)
.. ocv:function:: void gpu::FAST_GPU::operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
:param image: Image where keypoints (corners) are detected. Only 8-bit grayscale images are supported.
:param mask: Optional input mask that marks the regions where we should detect features.
:param keypoints: The output vector of keypoints. Can be stored both in CPU and GPU memory. For GPU memory:
* ``keypoints.ptr<Vec2s>(LOCATION_ROW)[i]`` will contain the location of the i'th point
* ``keypoints.ptr<float>(RESPONSE_ROW)[i]`` will contain the response of the i'th point (if non-maximum suppression is applied)
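For example, a detection pass on an uploaded 8-bit grayscale ``GpuMat`` ``d_image``, downloading the result to a ``std::vector<cv::KeyPoint>`` (a sketch): ::

    cv::gpu::FAST_GPU detector(20 /*threshold*/, true /*nonmaxSupression*/);

    cv::gpu::GpuMat d_keypoints;
    detector(d_image, cv::gpu::GpuMat(), d_keypoints);   // empty mask: detect on the whole image

    std::vector<cv::KeyPoint> keypoints;
    detector.downloadKeypoints(d_keypoints, keypoints);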
gpu::FAST_GPU::downloadKeypoints
-------------------------------------
Download keypoints from GPU to CPU memory.
.. ocv:function:: void gpu::FAST_GPU::downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints)
gpu::FAST_GPU::convertKeypoints
-------------------------------------
Converts keypoints from GPU representation to vector of ``KeyPoint``.
.. ocv:function:: void gpu::FAST_GPU::convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints)
gpu::FAST_GPU::release
-------------------------------------
Releases inner buffer memory.
.. ocv:function:: void gpu::FAST_GPU::release()
gpu::FAST_GPU::calcKeyPointsLocation
-------------------------------------
Finds keypoints and computes their responses if ``nonmaxSupression`` is true.
.. ocv:function:: int gpu::FAST_GPU::calcKeyPointsLocation(const GpuMat& image, const GpuMat& mask)
:param image: Image where keypoints (corners) are detected. Only 8-bit grayscale images are supported.
:param mask: Optional input mask that marks the regions where we should detect features.
The function returns the count of detected keypoints.
gpu::FAST_GPU::getKeyPoints
-------------------------------------
Gets final array of keypoints.
.. ocv:function:: int gpu::FAST_GPU::getKeyPoints(GpuMat& keypoints)
:param keypoints: The output vector of keypoints.
The function performs non-max suppression if needed and returns the final count of keypoints.
gpu::ORB_GPU
-------------
.. ocv:class:: gpu::ORB_GPU
Class for extracting ORB features and descriptors from an image. ::
class ORB_GPU
{
public:
enum
{
X_ROW = 0,
Y_ROW,
RESPONSE_ROW,
ANGLE_ROW,
OCTAVE_ROW,
SIZE_ROW,
ROWS_COUNT
};
enum
{
DEFAULT_FAST_THRESHOLD = 20
};
explicit ORB_GPU(int nFeatures = 500, float scaleFactor = 1.2f,
int nLevels = 8, int edgeThreshold = 31,
int firstLevel = 0, int WTA_K = 2,
int scoreType = 0, int patchSize = 31);
void operator()(const GpuMat& image, const GpuMat& mask,
std::vector<KeyPoint>& keypoints);
void operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints);
void operator()(const GpuMat& image, const GpuMat& mask,
std::vector<KeyPoint>& keypoints, GpuMat& descriptors);
void operator()(const GpuMat& image, const GpuMat& mask,
GpuMat& keypoints, GpuMat& descriptors);
void downloadKeyPoints(GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints);
void convertKeyPoints(Mat& d_keypoints, std::vector<KeyPoint>& keypoints);
int descriptorSize() const;
void setParams(size_t n_features, const ORB::CommonParams& detector_params);
void setFastParams(int threshold, bool nonmaxSupression = true);
void release();
bool blurForDescriptor;
};
The class implements the ORB feature detection and description algorithm.
gpu::ORB_GPU::ORB_GPU
-------------------------------------
Constructor.
.. ocv:function:: gpu::ORB_GPU::ORB_GPU(int nFeatures = 500, float scaleFactor = 1.2f, int nLevels = 8, int edgeThreshold = 31, int firstLevel = 0, int WTA_K = 2, int scoreType = 0, int patchSize = 31)
:param nFeatures: The number of desired features.
:param scaleFactor: Coefficient by which we divide the dimensions from one scale pyramid level to the next.
:param nLevels: The number of levels in the scale pyramid.
:param edgeThreshold: How far from the boundary the points should be.
:param firstLevel: The level at which the image is given. If 1, the detector also looks at the image ``scaleFactor`` times bigger.
gpu::ORB_GPU::operator()
-------------------------------------
Detects keypoints and computes descriptors for them.
.. ocv:function:: void gpu::ORB_GPU::operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
.. ocv:function:: void gpu::ORB_GPU::operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints)
.. ocv:function:: void gpu::ORB_GPU::operator()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints, GpuMat& descriptors)
.. ocv:function:: void gpu::ORB_GPU::operator()(const GpuMat& image, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors)
:param image: Input 8-bit grayscale image.
:param mask: Optional input mask that marks the regions where we should detect features.
:param keypoints: The input/output vector of keypoints. Can be stored both in CPU and GPU memory. For GPU memory:
* ``keypoints.ptr<float>(X_ROW)[i]`` contains x coordinate of the i'th feature.
* ``keypoints.ptr<float>(Y_ROW)[i]`` contains y coordinate of the i'th feature.
* ``keypoints.ptr<float>(RESPONSE_ROW)[i]`` contains the response of the i'th feature.
* ``keypoints.ptr<float>(ANGLE_ROW)[i]`` contains orientation of the i'th feature.
* ``keypoints.ptr<float>(OCTAVE_ROW)[i]`` contains the octave of the i'th feature.
* ``keypoints.ptr<float>(SIZE_ROW)[i]`` contains the size of the i'th feature.
:param descriptors: Computed descriptors. If ``blurForDescriptor`` is true, the image is blurred before the descriptors are calculated.
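A sketch that detects keypoints and computes descriptors on an uploaded 8-bit grayscale ``GpuMat`` ``d_image``: ::

    cv::gpu::ORB_GPU orb(500 /*nFeatures*/);

    cv::gpu::GpuMat d_keypoints, d_descriptors;
    orb(d_image, cv::gpu::GpuMat(), d_keypoints, d_descriptors);

    std::vector<cv::KeyPoint> keypoints;
    orb.downloadKeyPoints(d_keypoints, keypoints);       // descriptors stay on the GPU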
gpu::ORB_GPU::downloadKeyPoints
-------------------------------------
Download keypoints from GPU to CPU memory.
.. ocv:function:: static void gpu::ORB_GPU::downloadKeyPoints( const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints )
gpu::ORB_GPU::convertKeyPoints
-------------------------------------
Converts keypoints from GPU representation to vector of ``KeyPoint``.
.. ocv:function:: static void gpu::ORB_GPU::convertKeyPoints( const Mat& d_keypoints, std::vector<KeyPoint>& keypoints )
gpu::ORB_GPU::release
-------------------------------------
Releases inner buffer memory.
.. ocv:function:: void gpu::ORB_GPU::release()
gpu::BFMatcher_GPU
--------------------------
.. ocv:class:: gpu::BFMatcher_GPU
Brute-force descriptor matcher. For each descriptor in the first set, this matcher finds the closest descriptor in the second set by trying each one. This descriptor matcher supports masking permissible matches between descriptor sets. ::
class BFMatcher_GPU
{
public:
explicit BFMatcher_GPU(int norm = cv::NORM_L2);
// Add descriptors to train descriptor collection.
void add(const std::vector<GpuMat>& descCollection);
// Get train descriptors collection.
const std::vector<GpuMat>& getTrainDescriptors() const;
// Clear train descriptors collection.
void clear();
// Return true if there are no train descriptors in collection.
bool empty() const;
// Return true if the matcher supports mask in match methods.
bool isMaskSupported() const;
void matchSingle(const GpuMat& query, const GpuMat& train,
GpuMat& trainIdx, GpuMat& distance,
const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
static void matchDownload(const GpuMat& trainIdx,
const GpuMat& distance, std::vector<DMatch>& matches);
static void matchConvert(const Mat& trainIdx,
const Mat& distance, std::vector<DMatch>& matches);
void match(const GpuMat& query, const GpuMat& train,
std::vector<DMatch>& matches, const GpuMat& mask = GpuMat());
void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection,
const vector<GpuMat>& masks = std::vector<GpuMat>());
void matchCollection(const GpuMat& query, const GpuMat& trainCollection,
GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
const GpuMat& maskCollection, Stream& stream = Stream::Null());
static void matchDownload(const GpuMat& trainIdx, GpuMat& imgIdx,
const GpuMat& distance, std::vector<DMatch>& matches);
static void matchConvert(const Mat& trainIdx, const Mat& imgIdx,
const Mat& distance, std::vector<DMatch>& matches);
void match(const GpuMat& query, std::vector<DMatch>& matches,
const std::vector<GpuMat>& masks = std::vector<GpuMat>());
void knnMatchSingle(const GpuMat& query, const GpuMat& train,
GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
static void knnMatchConvert(const Mat& trainIdx, const Mat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
void knnMatch(const GpuMat& query, const GpuMat& train,
std::vector< std::vector<DMatch> >& matches, int k,
const GpuMat& mask = GpuMat(), bool compactResult = false);
void knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,
GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
const GpuMat& maskCollection = GpuMat(), Stream& stream = Stream::Null());
static void knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
static void knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
void knnMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, int k,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
bool compactResult = false);
void radiusMatchSingle(const GpuMat& query, const GpuMat& train,
GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
static void radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
void radiusMatch(const GpuMat& query, const GpuMat& train,
std::vector< std::vector<DMatch> >& matches, float maxDistance,
const GpuMat& mask = GpuMat(), bool compactResult = false);
void radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(), Stream& stream = Stream::Null());
static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
static void radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
void radiusMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, float maxDistance,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);
private:
std::vector<GpuMat> trainDescCollection;
};
The class ``BFMatcher_GPU`` has an interface similar to the class :ocv:class:`DescriptorMatcher`. It has two groups of ``match`` methods: for matching descriptors of one image with another image or with an image set. Also, all functions have an alternative to save results either to the GPU memory or to the CPU memory.
.. seealso:: :ocv:class:`DescriptorMatcher`, :ocv:class:`BFMatcher`
gpu::BFMatcher_GPU::match
-------------------------------------
Finds the best match for each descriptor from a query set with train descriptors.
.. ocv:function:: void gpu::BFMatcher_GPU::match(const GpuMat& query, const GpuMat& train, std::vector<DMatch>& matches, const GpuMat& mask = GpuMat())
.. ocv:function:: void gpu::BFMatcher_GPU::matchSingle(const GpuMat& query, const GpuMat& train, GpuMat& trainIdx, GpuMat& distance, const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null())
.. ocv:function:: void gpu::BFMatcher_GPU::match(const GpuMat& query, std::vector<DMatch>& matches, const std::vector<GpuMat>& masks = std::vector<GpuMat>())
.. ocv:function:: void gpu::BFMatcher_GPU::matchCollection( const GpuMat& query, const GpuMat& trainCollection, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, const GpuMat& masks=GpuMat(), Stream& stream=Stream::Null() )
.. seealso:: :ocv:func:`DescriptorMatcher::match`
gpu::BFMatcher_GPU::makeGpuCollection
-------------------------------------------------
Performs a GPU collection of train descriptors and masks in a suitable format for the :ocv:func:`gpu::BFMatcher_GPU::matchCollection` function.
.. ocv:function:: void gpu::BFMatcher_GPU::makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, const vector<GpuMat>& masks = std::vector<GpuMat>())
gpu::BFMatcher_GPU::matchDownload
---------------------------------------------
Downloads matrices obtained via :ocv:func:`gpu::BFMatcher_GPU::matchSingle` or :ocv:func:`gpu::BFMatcher_GPU::matchCollection` to vector with :ocv:class:`DMatch`.
.. ocv:function:: static void gpu::BFMatcher_GPU::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector<DMatch>&matches)
.. ocv:function:: static void gpu::BFMatcher_GPU::matchDownload( const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector<DMatch>& matches )
gpu::BFMatcher_GPU::matchConvert
---------------------------------------------
Converts matrices obtained via :ocv:func:`gpu::BFMatcher_GPU::matchSingle` or :ocv:func:`gpu::BFMatcher_GPU::matchCollection` to vector with :ocv:class:`DMatch`.
.. ocv:function:: void gpu::BFMatcher_GPU::matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>&matches)
.. ocv:function:: void gpu::BFMatcher_GPU::matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>&matches)
gpu::BFMatcher_GPU::knnMatch
----------------------------------------
Finds the ``k`` best matches for each descriptor from a query set with train descriptors.
.. ocv:function:: void gpu::BFMatcher_GPU::knnMatch(const GpuMat& query, const GpuMat& train, std::vector< std::vector<DMatch> >&matches, int k, const GpuMat& mask = GpuMat(), bool compactResult = false)
.. ocv:function:: void gpu::BFMatcher_GPU::knnMatchSingle(const GpuMat& query, const GpuMat& train, GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null())
.. ocv:function:: void gpu::BFMatcher_GPU::knnMatch(const GpuMat& query, std::vector< std::vector<DMatch> >&matches, int k, const std::vector<GpuMat>&masks = std::vector<GpuMat>(), bool compactResult = false )
.. ocv:function:: void gpu::BFMatcher_GPU::knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, const GpuMat& maskCollection = GpuMat(), Stream& stream = Stream::Null())
:param query: Query set of descriptors.
:param train: Training set of descriptors. It is not added to the train descriptors collection stored in the class object.
:param k: Number of best matches per query descriptor (or fewer, if not possible).
:param mask: Mask specifying permissible matches between the input query and train matrices of descriptors.
:param compactResult: If ``compactResult`` is ``true`` , the ``matches`` vector does not contain matches for fully masked-out query descriptors.
:param stream: Stream for the asynchronous version.
The function returns the ``k`` detected matches (or fewer, if not possible) in increasing order of distance.
The third variant of the method stores the results in GPU memory.
.. seealso:: :ocv:func:`DescriptorMatcher::knnMatch`
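A sketch of k-NN matching with a ratio test, assuming ``d_queryDescriptors`` and ``d_trainDescriptors`` are ``GpuMat`` descriptor matrices (for example, produced by ``gpu::ORB_GPU``): ::

    cv::gpu::BFMatcher_GPU matcher(cv::NORM_HAMMING);    // Hamming norm for binary descriptors

    std::vector< std::vector<cv::DMatch> > knnMatches;
    matcher.knnMatch(d_queryDescriptors, d_trainDescriptors, knnMatches, 2);

    // keep matches that pass the ratio test
    std::vector<cv::DMatch> good;
    for (size_t i = 0; i < knnMatches.size(); ++i)
        if (knnMatches[i].size() == 2 &&
            knnMatches[i][0].distance < 0.8f * knnMatches[i][1].distance)
            good.push_back(knnMatches[i][0]);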
gpu::BFMatcher_GPU::knnMatchDownload
------------------------------------------------
Downloads matrices obtained via :ocv:func:`gpu::BFMatcher_GPU::knnMatchSingle` or :ocv:func:`gpu::BFMatcher_GPU::knnMatch2Collection` to vector with :ocv:class:`DMatch`.
.. ocv:function:: void gpu::BFMatcher_GPU::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector< std::vector<DMatch> >&matches, bool compactResult = false)
.. ocv:function:: void gpu::BFMatcher_GPU::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector< std::vector<DMatch> >& matches, bool compactResult = false)
If ``compactResult`` is ``true`` , the ``matches`` vector does not contain matches for fully masked-out query descriptors.
gpu::BFMatcher_GPU::knnMatchConvert
------------------------------------------------
Converts matrices obtained via :ocv:func:`gpu::BFMatcher_GPU::knnMatchSingle` or :ocv:func:`gpu::BFMatcher_GPU::knnMatch2Collection` to CPU vector with :ocv:class:`DMatch`.
.. ocv:function:: void gpu::BFMatcher_GPU::knnMatchConvert(const Mat& trainIdx, const Mat& distance, std::vector< std::vector<DMatch> >&matches, bool compactResult = false)
.. ocv:function:: void gpu::BFMatcher_GPU::knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector< std::vector<DMatch> >& matches, bool compactResult = false)
If ``compactResult`` is ``true`` , the ``matches`` vector does not contain matches for fully masked-out query descriptors.
gpu::BFMatcher_GPU::radiusMatch
-------------------------------------------
For each query descriptor, finds the best matches with a distance less than a given threshold.
.. ocv:function:: void gpu::BFMatcher_GPU::radiusMatch(const GpuMat& query, const GpuMat& train, std::vector< std::vector<DMatch> >&matches, float maxDistance, const GpuMat& mask = GpuMat(), bool compactResult = false)
.. ocv:function:: void gpu::BFMatcher_GPU::radiusMatchSingle(const GpuMat& query, const GpuMat& train, GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null())
.. ocv:function:: void gpu::BFMatcher_GPU::radiusMatch(const GpuMat& query, std::vector< std::vector<DMatch> >&matches, float maxDistance, const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false)
.. ocv:function:: void gpu::BFMatcher_GPU::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance, const std::vector<GpuMat>& masks = std::vector<GpuMat>(), Stream& stream = Stream::Null())
:param query: Query set of descriptors.
:param train: Training set of descriptors. It is not added to the train descriptors collection stored in the class object.
:param maxDistance: Distance threshold.
:param mask: Mask specifying permissible matches between the input query and train matrices of descriptors.
:param compactResult: If ``compactResult`` is ``true`` , the ``matches`` vector does not contain matches for fully masked-out query descriptors.
:param stream: Stream for the asynchronous version.
The function returns the detected matches in increasing order of distance.
The methods work only on devices with the compute capability :math:`>=` 1.1.
The third variant of the method stores the results in GPU memory and does not store the points by the distance.
.. seealso:: :ocv:func:`DescriptorMatcher::radiusMatch`
gpu::BFMatcher_GPU::radiusMatchDownload
---------------------------------------------------
Downloads matrices obtained via :ocv:func:`gpu::BFMatcher_GPU::radiusMatchSingle` or :ocv:func:`gpu::BFMatcher_GPU::radiusMatchCollection` to vector with :ocv:class:`DMatch`.
.. ocv:function:: void gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches, std::vector< std::vector<DMatch> >&matches, bool compactResult = false)
.. ocv:function:: void gpu::BFMatcher_GPU::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches, std::vector< std::vector<DMatch> >& matches, bool compactResult = false)
If ``compactResult`` is ``true`` , the ``matches`` vector does not contain matches for fully masked-out query descriptors.
gpu::BFMatcher_GPU::radiusMatchConvert
---------------------------------------------------
Converts matrices obtained via :ocv:func:`gpu::BFMatcher_GPU::radiusMatchSingle` or :ocv:func:`gpu::BFMatcher_GPU::radiusMatchCollection` to vector with :ocv:class:`DMatch`.
.. ocv:function:: void gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches, std::vector< std::vector<DMatch> >&matches, bool compactResult = false)
.. ocv:function:: void gpu::BFMatcher_GPU::radiusMatchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, const Mat& nMatches, std::vector< std::vector<DMatch> >& matches, bool compactResult = false)
If ``compactResult`` is ``true`` , the ``matches`` vector does not contain matches for fully masked-out query descriptors.


@@ -8,5 +8,12 @@ gpu. GPU-accelerated Computer Vision
introduction
initalization_and_information
data_structures
operations_on_matrices
per_element_operations
image_processing
matrix_reductions
object_detection
calib3d
feature_detection_and_description
image_filtering
camera_calibration_and_3d_reconstruction
video


@@ -0,0 +1,719 @@
Image Filtering
===============
.. highlight:: cpp
Functions and classes described in this section are used to perform various linear or non-linear filtering operations on 2D images.
gpu::BaseRowFilter_GPU
----------------------
.. ocv:class:: gpu::BaseRowFilter_GPU
Base class for linear or non-linear filters that processes rows of 2D arrays. Such filters are used for the "horizontal" filtering passes in separable filters. ::
class BaseRowFilter_GPU
{
public:
BaseRowFilter_GPU(int ksize_, int anchor_);
virtual ~BaseRowFilter_GPU() {}
virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
int ksize, anchor;
};
.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
gpu::BaseColumnFilter_GPU
-------------------------
.. ocv:class:: gpu::BaseColumnFilter_GPU
Base class for linear or non-linear filters that processes columns of 2D arrays. Such filters are used for the "vertical" filtering passes in separable filters. ::
class BaseColumnFilter_GPU
{
public:
BaseColumnFilter_GPU(int ksize_, int anchor_);
virtual ~BaseColumnFilter_GPU() {}
virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
int ksize, anchor;
};
.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
gpu::BaseFilter_GPU
-------------------
.. ocv:class:: gpu::BaseFilter_GPU
Base class for non-separable 2D filters. ::
class CV_EXPORTS BaseFilter_GPU
{
public:
BaseFilter_GPU(const Size& ksize_, const Point& anchor_);
virtual ~BaseFilter_GPU() {}
virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null()) = 0;
Size ksize;
Point anchor;
};
.. note:: This class does not allocate memory for a destination image. Usually this class is used inside :ocv:class:`gpu::FilterEngine_GPU`.
gpu::FilterEngine_GPU
---------------------
.. ocv:class:: gpu::FilterEngine_GPU
Base class for the Filter Engine. ::
class CV_EXPORTS FilterEngine_GPU
{
public:
virtual ~FilterEngine_GPU() {}
virtual void apply(const GpuMat& src, GpuMat& dst,
Rect roi = Rect(0,0,-1,-1), Stream& stream = Stream::Null()) = 0;
};
The class can be used to apply an arbitrary filtering operation to an image. It contains all the necessary intermediate buffers. Pointers to the initialized ``FilterEngine_GPU`` instances are returned by various ``create*Filter_GPU`` functions (see below), and they are used inside high-level functions such as :ocv:func:`gpu::filter2D`, :ocv:func:`gpu::erode`, :ocv:func:`gpu::Sobel` , and others.
By using ``FilterEngine_GPU`` instead of functions you can avoid unnecessary memory allocation for intermediate buffers and get better performance: ::
while (...)
{
gpu::GpuMat src = getImg();
gpu::GpuMat dst;
// Allocate and release buffers at each iterations
gpu::GaussianBlur(src, dst, ksize, sigma1);
}
// Allocate buffers only once
cv::Ptr<gpu::FilterEngine_GPU> filter =
gpu::createGaussianFilter_GPU(CV_8UC4, ksize, sigma1);
while (...)
{
gpu::GpuMat src = getImg();
gpu::GpuMat dst;
filter->apply(src, dst, cv::Rect(0, 0, src.cols, src.rows));
}
// Release buffers only once
filter.release();
``FilterEngine_GPU`` can process a rectangular sub-region of an image. By default, if ``roi == Rect(0,0,-1,-1)`` , ``FilterEngine_GPU`` processes the inner region of an image ( ``Rect(anchor.x, anchor.y, src_size.width - ksize.width, src_size.height - ksize.height)`` ) because some filters do not check whether indices are outside the image for better performance. See below to understand which filters support processing the whole image and which do not and identify image type limitations.
.. note:: The GPU filters do not support the in-place mode.
.. seealso:: :ocv:class:`gpu::BaseRowFilter_GPU`, :ocv:class:`gpu::BaseColumnFilter_GPU`, :ocv:class:`gpu::BaseFilter_GPU`, :ocv:func:`gpu::createFilter2D_GPU`, :ocv:func:`gpu::createSeparableFilter_GPU`, :ocv:func:`gpu::createBoxFilter_GPU`, :ocv:func:`gpu::createMorphologyFilter_GPU`, :ocv:func:`gpu::createLinearFilter_GPU`, :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`gpu::createDerivFilter_GPU`, :ocv:func:`gpu::createGaussianFilter_GPU`
gpu::createFilter2D_GPU
---------------------------
Creates a non-separable filter engine with the specified filter.
.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createFilter2D_GPU( const Ptr<BaseFilter_GPU>& filter2D, int srcType, int dstType)
:param filter2D: Non-separable 2D filter.
:param srcType: Input image type. It must be supported by ``filter2D`` .
:param dstType: Output image type. It must be supported by ``filter2D`` .
Usually this function is used inside such high-level functions as :ocv:func:`gpu::createLinearFilter_GPU`, :ocv:func:`gpu::createBoxFilter_GPU`.
gpu::createSeparableFilter_GPU
----------------------------------
Creates a separable filter engine with the specified filters.
.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createSeparableFilter_GPU( const Ptr<BaseRowFilter_GPU>& rowFilter, const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType)
:param rowFilter: "Horizontal" 1D filter.
:param columnFilter: "Vertical" 1D filter.
:param srcType: Input image type. It must be supported by ``rowFilter`` .
:param bufType: Buffer image type. It must be supported by ``rowFilter`` and ``columnFilter`` .
:param dstType: Output image type. It must be supported by ``columnFilter`` .
Usually this function is used inside such high-level functions as :ocv:func:`gpu::createSeparableLinearFilter_GPU`.
gpu::getRowSumFilter_GPU
----------------------------
Creates a horizontal 1D box filter.
.. ocv:function:: Ptr<BaseRowFilter_GPU> gpu::getRowSumFilter_GPU(int srcType, int sumType, int ksize, int anchor = -1)
:param srcType: Input image type. Only ``CV_8UC1`` type is supported for now.
:param sumType: Output image type. Only ``CV_32FC1`` type is supported for now.
:param ksize: Kernel size.
:param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
gpu::getColumnSumFilter_GPU
-------------------------------
Creates a vertical 1D box filter.
.. ocv:function:: Ptr<BaseColumnFilter_GPU> gpu::getColumnSumFilter_GPU(int sumType, int dstType, int ksize, int anchor = -1)
:param sumType: Input image type. Only ``CV_8UC1`` type is supported for now.
:param dstType: Output image type. Only ``CV_32FC1`` type is supported for now.
:param ksize: Kernel size.
:param anchor: Anchor point. The default value (-1) means that the anchor is at the kernel center.
.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
gpu::createBoxFilter_GPU
----------------------------
Creates a normalized 2D box filter.
.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createBoxFilter_GPU(int srcType, int dstType, const Size& ksize, const Point& anchor = Point(-1,-1))
.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getBoxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1, -1))
:param srcType: Input image type supporting ``CV_8UC1`` and ``CV_8UC4`` .
:param dstType: Output image type. It supports only the same values as the source type.
:param ksize: Kernel size.
:param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
.. seealso:: :ocv:func:`boxFilter`
gpu::boxFilter
------------------
Smooths the image using the normalized box filter.
.. ocv:function:: void gpu::boxFilter(const GpuMat& src, GpuMat& dst, int ddepth, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
:param src: Input image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
:param dst: Output image. The size and type are the same as ``src`` .
:param ddepth: Output image depth. If -1, the output image has the same depth as the input one. The only values allowed here are ``CV_8U`` and -1.
:param ksize: Kernel size.
:param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
:param stream: Stream for the asynchronous version.
.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
.. seealso:: :ocv:func:`boxFilter`
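For illustration, a minimal box-filter sketch (assuming the 2.4-style ``opencv2/gpu/gpu.hpp`` header, a CUDA device, and a synthetic ``CV_8UC1`` input instead of a real image): ::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat src(480, 640, CV_8UC1), dst;
        src.setTo(cv::Scalar::all(128));                  // synthetic input data

        cv::gpu::boxFilter(src, dst, -1, cv::Size(5, 5)); // 5x5 normalized box filter

        cv::Mat result;
        dst.download(result);                             // copy the result back to the host
        return 0;
    }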
gpu::blur
-------------
Acts as a synonym for the normalized box filter.
.. ocv:function:: void gpu::blur(const GpuMat& src, GpuMat& dst, Size ksize, Point anchor = Point(-1,-1), Stream& stream = Stream::Null())
:param src: Input image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
:param dst: Output image with the same size and type as ``src`` .
:param ksize: Kernel size.
:param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
:param stream: Stream for the asynchronous version.
.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
.. seealso:: :ocv:func:`blur`, :ocv:func:`gpu::boxFilter`
gpu::createMorphologyFilter_GPU
-----------------------------------
Creates a 2D morphological filter.
.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Point& anchor = Point(-1,-1), int iterations = 1)
.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, Point anchor=Point(-1,-1))
:param op: Morphology operation id. Only ``MORPH_ERODE`` and ``MORPH_DILATE`` are supported.
:param type: Input/output image type. Only ``CV_8UC1`` and ``CV_8UC4`` are supported.
:param kernel: 2D 8-bit structuring element for the morphological operation.
:param ksize: Size of a horizontal or vertical structuring element used for separable morphological operations.
:param anchor: Anchor position within the structuring element. Negative values mean that the anchor is at the center.
.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
.. seealso:: :ocv:func:`createMorphologyFilter`
gpu::erode
--------------
Erodes an image by using a specific structuring element.
.. ocv:function:: void gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
.. ocv:function:: void gpu::erode( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
:param src: Source image. Only ``CV_8UC1`` and ``CV_8UC4`` types are supported.
:param dst: Destination image with the same size and type as ``src`` .
:param kernel: Structuring element used for erosion. If ``kernel=Mat()``, a 3x3 rectangular structuring element is used.
:param anchor: Position of an anchor within the element. The default value ``(-1, -1)`` means that the anchor is at the element center.
:param iterations: Number of times erosion is applied.
:param stream: Stream for the asynchronous version.
.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
.. seealso:: :ocv:func:`erode`
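A short usage sketch (illustrative only; a 3x3 rectangular structuring element applied twice to a synthetic mask, with the 2.4-style header layout): ::

    #include <opencv2/core/core.hpp>
    #include <opencv2/imgproc/imgproc.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat src(480, 640, CV_8UC1), dst;
        src.setTo(cv::Scalar::all(255));                     // synthetic all-white mask

        // 3x3 rectangular structuring element, two erosion iterations.
        cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
        cv::gpu::erode(src, dst, kernel, cv::Point(-1, -1), 2);

        return 0;
    }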
gpu::dilate
---------------
Dilates an image by using a specific structuring element.
.. ocv:function:: void gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
.. ocv:function:: void gpu::dilate( const GpuMat& src, GpuMat& dst, const Mat& kernel, GpuMat& buf, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
:param src: Source image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
:param dst: Destination image with the same size and type as ``src``.
:param kernel: Structuring element used for dilation. If ``kernel=Mat()``, a 3x3 rectangular structuring element is used.
:param anchor: Position of an anchor within the element. The default value ``(-1, -1)`` means that the anchor is at the element center.
:param iterations: Number of times dilation is applied.
:param stream: Stream for the asynchronous version.
.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
.. seealso:: :ocv:func:`dilate`
gpu::morphologyEx
---------------------
Applies an advanced morphological operation to an image.
.. ocv:function:: void gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, Point anchor=Point(-1, -1), int iterations=1 )
.. ocv:function:: void gpu::morphologyEx( const GpuMat& src, GpuMat& dst, int op, const Mat& kernel, GpuMat& buf1, GpuMat& buf2, Point anchor=Point(-1, -1), int iterations=1, Stream& stream=Stream::Null() )
:param src: Source image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
:param dst: Destination image with the same size and type as ``src`` .
:param op: Type of morphological operation. The following types are possible:
* **MORPH_OPEN** opening
* **MORPH_CLOSE** closing
* **MORPH_GRADIENT** morphological gradient
* **MORPH_TOPHAT** "top hat"
* **MORPH_BLACKHAT** "black hat"
:param kernel: Structuring element.
:param anchor: Position of an anchor within the element. The default value ``Point(-1, -1)`` means that the anchor is at the element center.
:param iterations: Number of times erosion and dilation are applied.
:param stream: Stream for the asynchronous version.
.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
.. seealso:: :ocv:func:`morphologyEx`
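As an illustration only (synthetic input and 2.4-style headers assumed), morphological opening could be expressed as: ::

    #include <opencv2/core/core.hpp>
    #include <opencv2/imgproc/imgproc.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat src(480, 640, CV_8UC1), dst;
        src.setTo(cv::Scalar::all(200));                 // synthetic input data

        // Opening = erosion followed by dilation, here with a 5x5 rectangular element.
        cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(5, 5));
        cv::gpu::morphologyEx(src, dst, cv::MORPH_OPEN, kernel);

        return 0;
    }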
gpu::createLinearFilter_GPU
-------------------------------
Creates a non-separable linear filter.
.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, Point anchor = Point(-1,-1), int borderType = BORDER_DEFAULT)
:param srcType: Input image type. Supports ``CV_8U`` , ``CV_16U`` and ``CV_32F`` one and four channel image.
:param dstType: Output image type. The same type as ``src`` is supported.
:param kernel: 2D array of filter coefficients. Floating-point coefficients will be converted to fixed-point representation before the actual processing. Supports size up to 16. For larger kernels use :ocv:func:`gpu::convolve`.
:param anchor: Anchor point. The default value Point(-1, -1) means that the anchor is at the kernel center.
:param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
.. seealso:: :ocv:func:`createLinearFilter`
gpu::filter2D
-----------------
Applies the non-separable 2D linear filter to an image.
.. ocv:function:: void gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernel, Point anchor=Point(-1,-1), int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null())
:param src: Source image. Supports ``CV_8U`` , ``CV_16U`` and ``CV_32F`` one and four channel image.
:param dst: Destination image. The size and the number of channels is the same as ``src`` .
:param ddepth: Desired depth of the destination image. If it is negative, it is the same as ``src.depth()`` . It supports only the same depth as the source image depth.
:param kernel: 2D array of filter coefficients.
:param anchor: Anchor of the kernel that indicates the relative position of a filtered point within the kernel. The anchor resides within the kernel. The special default value (-1,-1) means that the anchor is at the kernel center.
:param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`filter2D`, :ocv:func:`gpu::convolve`
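A minimal sketch of non-separable filtering with a hand-written 3x3 sharpening kernel (illustrative values; 2.4-style headers and synthetic input assumed): ::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat src(480, 640, CV_8UC1), dst;
        src.setTo(cv::Scalar::all(100));                 // synthetic input data

        // Simple 3x3 sharpening kernel.
        cv::Mat kernel = (cv::Mat_<float>(3, 3) <<  0, -1,  0,
                                                   -1,  5, -1,
                                                    0, -1,  0);
        cv::gpu::filter2D(src, dst, -1, kernel);

        return 0;
    }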
gpu::Laplacian
------------------
Applies the Laplacian operator to an image.
.. ocv:function:: void gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize = 1, double scale = 1, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null())
:param src: Source image. ``CV_8UC1`` and ``CV_8UC4`` source types are supported.
:param dst: Destination image. The size and number of channels is the same as ``src`` .
:param ddepth: Desired depth of the destination image. It supports only the same depth as the source image depth.
:param ksize: Aperture size used to compute the second-derivative filters (see :ocv:func:`getDerivKernels`). It must be positive and odd. Only ``ksize`` = 1 and ``ksize`` = 3 are supported.
:param scale: Optional scale factor for the computed Laplacian values. By default, no scaling is applied (see :ocv:func:`getDerivKernels` ).
:param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` .
:param stream: Stream for the asynchronous version.
.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
.. seealso:: :ocv:func:`Laplacian`, :ocv:func:`gpu::filter2D`
gpu::getLinearRowFilter_GPU
-------------------------------
Creates a primitive row filter with the specified kernel.
.. ocv:function:: Ptr<BaseRowFilter_GPU> gpu::getLinearRowFilter_GPU( int srcType, int bufType, const Mat& rowKernel, int anchor=-1, int borderType=BORDER_DEFAULT )
:param srcType: Source array type. Only ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported.
:param bufType: Intermediate buffer type with as many channels as ``srcType`` .
:param rowKernel: Filter coefficients. Kernels with ``size <= 16`` are supported.
:param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
:param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`. For details on limitations, see below.
There are two versions of the algorithm: NPP and OpenCV.
* NPP version is called when ``srcType == CV_8UC1`` or ``srcType == CV_8UC4`` and ``bufType == srcType`` . Otherwise, the OpenCV version is called. NPP supports only ``BORDER_CONSTANT`` border type and does not check indices outside the image.
* OpenCV version supports only ``CV_32F`` buffer depth and ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , and ``BORDER_CONSTANT`` border types. It checks indices outside the image.
.. seealso:: :ocv:func:`createSeparableLinearFilter` .
gpu::getLinearColumnFilter_GPU
----------------------------------
Creates a primitive column filter with the specified kernel.
.. ocv:function:: Ptr<BaseColumnFilter_GPU> gpu::getLinearColumnFilter_GPU( int bufType, int dstType, const Mat& columnKernel, int anchor=-1, int borderType=BORDER_DEFAULT )
:param bufType: Intermediate buffer type with as many channels as ``dstType`` .
:param dstType: Destination array type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` destination types are supported.
:param columnKernel: Filter coefficients. Kernels with ``size <= 16`` are supported.
:param anchor: Anchor position within the kernel. Negative values mean that the anchor is positioned at the aperture center.
:param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate` . For details on limitations, see below.
There are two versions of the algorithm: NPP and OpenCV.
* NPP version is called when ``dstType == CV_8UC1`` or ``dstType == CV_8UC4`` and ``bufType == dstType`` . Otherwise, the OpenCV version is called. NPP supports only ``BORDER_CONSTANT`` border type and does not check indices outside the image.
* OpenCV version supports only ``CV_32F`` buffer depth and ``BORDER_REFLECT101`` , ``BORDER_REPLICATE`` , and ``BORDER_CONSTANT`` border types. It checks indices outside the image.
.. seealso:: :ocv:func:`gpu::getLinearRowFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
gpu::createSeparableLinearFilter_GPU
----------------------------------------
Creates a separable linear filter engine.
.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, const Point& anchor = Point(-1,-1), int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1)
:param srcType: Source array type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported.
:param dstType: Destination array type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` destination types are supported.
:param rowKernel: Horizontal filter coefficients. Kernels with ``size <= 16`` are supported.
:param columnKernel: Vertical filter coefficients. Kernels with ``size <= 16`` are supported.
:param anchor: Anchor position within the kernel. Negative values mean that anchor is positioned at the aperture center.
:param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see :ocv:func:`borderInterpolate`. For details on limitations, see :ocv:func:`gpu::getLinearRowFilter_GPU` and :ocv:func:`gpu::getLinearColumnFilter_GPU`.
:param columnBorderType: Pixel extrapolation method in the horizontal direction.
.. seealso:: :ocv:func:`gpu::getLinearRowFilter_GPU`, :ocv:func:`gpu::getLinearColumnFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
gpu::sepFilter2D
--------------------
Applies a separable 2D linear filter to an image.
.. ocv:function:: void gpu::sepFilter2D( const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor=Point(-1,-1), int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
.. ocv:function:: void gpu::sepFilter2D( const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf, Point anchor=Point(-1,-1), int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
:param src: Source image. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported.
:param dst: Destination image with the same size and number of channels as ``src`` .
:param ddepth: Destination image depth. ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and ``CV_32F`` are supported.
:param kernelX: Horizontal filter coefficients.
:param kernelY: Vertical filter coefficients.
:param anchor: Anchor position within the kernel. The default value ``(-1, -1)`` means that the anchor is at the kernel center.
:param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see :ocv:func:`borderInterpolate`.
:param columnBorderType: Pixel extrapolation method in the horizontal direction.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`sepFilter2D`
gpu::createDerivFilter_GPU
------------------------------
Creates a filter engine for the generalized Sobel operator.
.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int rowBorderType = BORDER_DEFAULT, int columnBorderType = -1)
:param srcType: Source image type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported.
:param dstType: Destination image type with as many channels as ``srcType`` , ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and ``CV_32F`` depths are supported.
:param dx: Derivative order with respect to x.
:param dy: Derivative order with respect to y.
:param ksize: Aperture size. See :ocv:func:`getDerivKernels` for details.
:param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see :ocv:func:`borderInterpolate`.
:param columnBorderType: Pixel extrapolation method in the horizontal direction.
.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`createDerivFilter`
gpu::Sobel
--------------
Applies the generalized Sobel operator to an image.
.. ocv:function:: void gpu::Sobel( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize=3, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
.. ocv:function:: void gpu::Sobel( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, int ksize=3, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
:param src: Source image. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported.
:param dst: Destination image with the same size and number of channels as source image.
:param ddepth: Destination image depth. ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and ``CV_32F`` are supported.
:param dx: Derivative order with respect to x.
:param dy: Derivative order with respect to y.
:param ksize: Size of the extended Sobel kernel. Possible values are 1, 3, 5 or 7.
:param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. For details, see :ocv:func:`getDerivKernels` .
:param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see :ocv:func:`borderInterpolate`.
:param columnBorderType: Pixel extrapolation method in the horizontal direction.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`Sobel`
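A minimal sketch computing both first derivatives (synthetic ``CV_8UC1`` input, ``CV_16S`` output depth, 2.4-style headers assumed): ::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat src(480, 640, CV_8UC1), dx, dy;
        src.setTo(cv::Scalar::all(100));          // synthetic input data

        cv::gpu::Sobel(src, dx, CV_16S, 1, 0, 3); // d/dx with a 3x3 aperture
        cv::gpu::Sobel(src, dy, CV_16S, 0, 1, 3); // d/dy with a 3x3 aperture

        return 0;
    }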
gpu::Scharr
---------------
Calculates the first x- or y- image derivative using the Scharr operator.
.. ocv:function:: void gpu::Scharr( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
.. ocv:function:: void gpu::Scharr( const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, GpuMat& buf, double scale=1, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
:param src: Source image. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported.
:param dst: Destination image with the same size and number of channels as ``src`` has.
:param ddepth: Destination image depth. ``CV_8U`` , ``CV_16S`` , ``CV_32S`` , and ``CV_32F`` are supported.
:param dx: Order of the derivative in x.
:param dy: Order of the derivative in y.
:param scale: Optional scale factor for the computed derivative values. By default, no scaling is applied. See :ocv:func:`getDerivKernels` for details.
:param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see :ocv:func:`borderInterpolate`.
:param columnBorderType: Pixel extrapolation method in the horizontal direction.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`Scharr`
gpu::createGaussianFilter_GPU
---------------------------------
Creates a Gaussian filter engine.
.. ocv:function:: Ptr<FilterEngine_GPU> gpu::createGaussianFilter_GPU( int type, Size ksize, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
:param type: Source and destination image type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` are supported.
:param ksize: Aperture size. See :ocv:func:`getGaussianKernel` for details.
:param sigma1: Gaussian sigma in the horizontal direction. See :ocv:func:`getGaussianKernel` for details.
:param sigma2: Gaussian sigma in the vertical direction. If 0, then :math:`\texttt{sigma2}\leftarrow\texttt{sigma1}` .
:param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see :ocv:func:`borderInterpolate`.
:param columnBorderType: Pixel extrapolation method in the horizontal direction.
.. seealso:: :ocv:func:`gpu::createSeparableLinearFilter_GPU`, :ocv:func:`createGaussianFilter`
gpu::GaussianBlur
---------------------
Smooths an image using the Gaussian filter.
.. ocv:function:: void gpu::GaussianBlur( const GpuMat& src, GpuMat& dst, Size ksize, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1 )
.. ocv:function:: void gpu::GaussianBlur( const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& buf, double sigma1, double sigma2=0, int rowBorderType=BORDER_DEFAULT, int columnBorderType=-1, Stream& stream=Stream::Null() )
:param src: Source image. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported.
:param dst: Destination image with the same size and type as ``src`` .
:param ksize: Gaussian kernel size. ``ksize.width`` and ``ksize.height`` can differ but they both must be positive and odd. If they are zeros, they are computed from ``sigma1`` and ``sigma2`` .
:param sigma1: Gaussian kernel standard deviation in X direction.
:param sigma2: Gaussian kernel standard deviation in Y direction. If ``sigma2`` is zero, it is set to be equal to ``sigma1`` . If they are both zeros, they are computed from ``ksize.width`` and ``ksize.height``, respectively. See :ocv:func:`getGaussianKernel` for details. To fully control the result regardless of possible future modification of all this semantics, you are recommended to specify all of ``ksize`` , ``sigma1`` , and ``sigma2`` .
:param rowBorderType: Pixel extrapolation method in the vertical direction. For details, see :ocv:func:`borderInterpolate`.
:param columnBorderType: Pixel extrapolation method in the horizontal direction.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`gpu::createGaussianFilter_GPU`, :ocv:func:`GaussianBlur`
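For illustration, a minimal Gaussian smoothing sketch (synthetic input; ``sigma2`` is left at 0 so it defaults to ``sigma1``): ::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat src(480, 640, CV_8UC1), dst;
        src.setTo(cv::Scalar::all(100));                      // synthetic input data

        cv::gpu::GaussianBlur(src, dst, cv::Size(5, 5), 1.5); // 5x5 kernel, sigma = 1.5

        return 0;
    }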
gpu::getMaxFilter_GPU
-------------------------
Creates the maximum filter.
.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMaxFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1))
:param srcType: Input image type. Only ``CV_8UC1`` and ``CV_8UC4`` are supported.
:param dstType: Output image type. It supports only the same type as the source type.
:param ksize: Kernel size.
:param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.
gpu::getMinFilter_GPU
-------------------------
Creates the minimum filter.
.. ocv:function:: Ptr<BaseFilter_GPU> gpu::getMinFilter_GPU(int srcType, int dstType, const Size& ksize, Point anchor = Point(-1,-1))
:param srcType: Input image type. Only ``CV_8UC1`` and ``CV_8UC4`` are supported.
:param dstType: Output image type. It supports only the same type as the source type.
:param ksize: Kernel size.
:param anchor: Anchor point. The default value ``Point(-1, -1)`` means that the anchor is at the kernel center.
.. note:: This filter does not check out-of-border accesses, so only a proper sub-matrix of a bigger matrix has to be passed to it.



@@ -0,0 +1,207 @@
Matrix Reductions
=================
.. highlight:: cpp
gpu::meanStdDev
-------------------
Computes a mean value and a standard deviation of matrix elements.
.. ocv:function:: void gpu::meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev)
.. ocv:function:: void gpu::meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuMat& buf)
:param mtx: Source matrix. ``CV_8UC1`` matrices are supported for now.
:param mean: Mean value.
:param stddev: Standard deviation value.
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
.. seealso:: :ocv:func:`meanStdDev`
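A minimal sketch (synthetic ``CV_8UC1`` input and 2.4-style headers assumed): ::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat src(480, 640, CV_8UC1);
        src.setTo(cv::Scalar::all(100));        // synthetic input data

        cv::Scalar mean, stddev;
        cv::gpu::meanStdDev(src, mean, stddev); // mean[0] and stddev[0] hold the results

        return 0;
    }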
gpu::norm
-------------
Returns the norm of a matrix (or difference of two matrices).
.. ocv:function:: double gpu::norm(const GpuMat& src1, int normType=NORM_L2)
.. ocv:function:: double gpu::norm(const GpuMat& src1, int normType, GpuMat& buf)
.. ocv:function:: double gpu::norm(const GpuMat& src1, int normType, const GpuMat& mask, GpuMat& buf)
.. ocv:function:: double gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2)
:param src1: Source matrix. Any matrix type except ``CV_64F`` is supported.
:param src2: Second source matrix (if any) with the same size and type as ``src1``.
:param normType: Norm type. ``NORM_L1`` , ``NORM_L2`` , and ``NORM_INF`` are supported for now.
:param mask: optional operation mask; it must have the same size as ``src1`` and ``CV_8UC1`` type.
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
.. seealso:: :ocv:func:`norm`
gpu::sum
------------
Returns the sum of matrix elements.
.. ocv:function:: Scalar gpu::sum(const GpuMat& src)
.. ocv:function:: Scalar gpu::sum(const GpuMat& src, GpuMat& buf)
.. ocv:function:: Scalar gpu::sum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
:param src: Source image of any depth except for ``CV_64F`` .
:param mask: optional operation mask; it must have the same size as ``src`` and ``CV_8UC1`` type.
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
.. seealso:: :ocv:func:`sum`
gpu::absSum
---------------
Returns the sum of absolute values for matrix elements.
.. ocv:function:: Scalar gpu::absSum(const GpuMat& src)
.. ocv:function:: Scalar gpu::absSum(const GpuMat& src, GpuMat& buf)
.. ocv:function:: Scalar gpu::absSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
:param src: Source image of any depth except for ``CV_64F`` .
:param mask: optional operation mask; it must have the same size as ``src`` and ``CV_8UC1`` type.
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
gpu::sqrSum
---------------
Returns the squared sum of matrix elements.
.. ocv:function:: Scalar gpu::sqrSum(const GpuMat& src)
.. ocv:function:: Scalar gpu::sqrSum(const GpuMat& src, GpuMat& buf)
.. ocv:function:: Scalar gpu::sqrSum(const GpuMat& src, const GpuMat& mask, GpuMat& buf)
:param src: Source image of any depth except for ``CV_64F`` .
:param mask: optional operation mask; it must have the same size as ``src`` and ``CV_8UC1`` type.
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
gpu::minMax
---------------
Finds global minimum and maximum matrix elements and returns their values.
.. ocv:function:: void gpu::minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat())
.. ocv:function:: void gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
:param src: Single-channel source image.
:param minVal: Pointer to the returned minimum value. Use ``NULL`` if not required.
:param maxVal: Pointer to the returned maximum value. Use ``NULL`` if not required.
:param mask: Optional mask to select a sub-matrix.
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
The function does not work with ``CV_64F`` images on GPUs with the compute capability < 1.3.
.. seealso:: :ocv:func:`minMaxLoc`
gpu::minMaxLoc
------------------
Finds global minimum and maximum matrix elements and returns their values with locations.
.. ocv:function:: void gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, const GpuMat& mask=GpuMat())
.. ocv:function:: void gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf)
:param src: Single-channel source image.
:param minVal: Pointer to the returned minimum value. Use ``NULL`` if not required.
:param maxVal: Pointer to the returned maximum value. Use ``NULL`` if not required.
:param minLoc: Pointer to the returned minimum location. Use ``NULL`` if not required.
:param maxLoc: Pointer to the returned maximum location. Use ``NULL`` if not required.
:param mask: Optional mask to select a sub-matrix.
:param valbuf: Optional values buffer to avoid extra memory allocations. It is resized automatically.
:param locbuf: Optional locations buffer to avoid extra memory allocations. It is resized automatically.
The function does not work with ``CV_64F`` images on GPUs with the compute capability < 1.3.
.. seealso:: :ocv:func:`minMaxLoc`
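A minimal sketch (synthetic single-channel input; unused output pointers could also be passed as ``NULL``): ::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat src(480, 640, CV_8UC1);
        src.setTo(cv::Scalar::all(100));        // synthetic input data

        double minVal, maxVal;
        cv::Point minLoc, maxLoc;
        cv::gpu::minMaxLoc(src, &minVal, &maxVal, &minLoc, &maxLoc);

        return 0;
    }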
gpu::countNonZero
---------------------
Counts non-zero matrix elements.
.. ocv:function:: int gpu::countNonZero(const GpuMat& src)
.. ocv:function:: int gpu::countNonZero(const GpuMat& src, GpuMat& buf)
:param src: Single-channel source image.
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
The function does not work with ``CV_64F`` images on GPUs with the compute capability < 1.3.
.. seealso:: :ocv:func:`countNonZero`
gpu::reduce
-----------
Reduces a matrix to a vector.
.. ocv:function:: void gpu::reduce(const GpuMat& mtx, GpuMat& vec, int dim, int reduceOp, int dtype = -1, Stream& stream = Stream::Null())
:param mtx: Source 2D matrix.
:param vec: Destination vector. Its size and type is defined by ``dim`` and ``dtype`` parameters.
:param dim: Dimension index along which the matrix is reduced. 0 means that the matrix is reduced to a single row. 1 means that the matrix is reduced to a single column.
:param reduceOp: Reduction operation that could be one of the following:
* **CV_REDUCE_SUM** The output is the sum of all rows/columns of the matrix.
* **CV_REDUCE_AVG** The output is the mean vector of all rows/columns of the matrix.
* **CV_REDUCE_MAX** The output is the maximum (column/row-wise) of all rows/columns of the matrix.
* **CV_REDUCE_MIN** The output is the minimum (column/row-wise) of all rows/columns of the matrix.
:param dtype: When it is negative, the destination vector will have the same type as the source matrix. Otherwise, its type will be ``CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), mtx.channels())`` .
The function ``reduce`` reduces the matrix to a vector by treating the matrix rows/columns as a set of 1D vectors and performing the specified operation on the vectors until a single row/column is obtained. For example, the function can be used to compute horizontal and vertical projections of a raster image. In case of ``CV_REDUCE_SUM`` and ``CV_REDUCE_AVG`` , the output may have a larger element bit-depth to preserve accuracy. And multi-channel arrays are also supported in these two reduction modes.
.. seealso:: :ocv:func:`reduce`
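For illustration, reducing every column of a synthetic ``CV_8UC1`` matrix to its sum (a ``CV_32F`` output depth is requested to avoid overflow; the ``CV_REDUCE_SUM`` constant comes from the C core header): ::

    #include <opencv2/core/core.hpp>
    #include <opencv2/core/core_c.h>  // for CV_REDUCE_SUM
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat src(480, 640, CV_8UC1), colSums;
        src.setTo(cv::Scalar::all(10));                          // synthetic input data

        // dim = 0 reduces the matrix to a single row of per-column sums.
        cv::gpu::reduce(src, colSums, 0, CV_REDUCE_SUM, CV_32F);

        return 0;
    }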


@@ -0,0 +1,274 @@
Operations on Matrices
======================
.. highlight:: cpp
gpu::gemm
------------------
Performs generalized matrix multiplication.
.. ocv:function:: void gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null())
:param src1: First multiplied input matrix that should have ``CV_32FC1`` , ``CV_64FC1`` , ``CV_32FC2`` , or ``CV_64FC2`` type.
:param src2: Second multiplied input matrix of the same type as ``src1`` .
:param alpha: Weight of the matrix product.
:param src3: Third optional delta matrix added to the matrix product. It should have the same type as ``src1`` and ``src2`` .
:param beta: Weight of ``src3`` .
:param dst: Destination matrix. It has the proper size and the same type as input matrices.
:param flags: Operation flags:
* **GEMM_1_T** transpose ``src1``
* **GEMM_2_T** transpose ``src2``
* **GEMM_3_T** transpose ``src3``
:param stream: Stream for the asynchronous version.
The function performs generalized matrix multiplication similar to the ``gemm`` functions in BLAS level 3. For example, ``gemm(src1, src2, alpha, src3, beta, dst, GEMM_1_T + GEMM_3_T)`` corresponds to
.. math::
\texttt{dst} = \texttt{alpha} \cdot \texttt{src1} ^T \cdot \texttt{src2} + \texttt{beta} \cdot \texttt{src3} ^T
.. note:: Transposition operation doesn't support ``CV_64FC2`` input type.
.. seealso:: :ocv:func:`gemm`
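A minimal sketch of a plain product ``C = A * B`` with synthetic ``CV_32FC1`` matrices (note that ``gpu::gemm`` may require OpenCV to be built with CUBLAS support; the empty ``GpuMat()`` stands for the unused ``src3``): ::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat A(64, 64, CV_32FC1), B(64, 64, CV_32FC1), C;
        A.setTo(cv::Scalar::all(1));   // synthetic input data
        B.setTo(cv::Scalar::all(2));

        // C = 1.0 * A * B + 0.0 * src3 (src3 is not used here)
        cv::gpu::gemm(A, B, 1.0, cv::gpu::GpuMat(), 0.0, C);

        return 0;
    }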
gpu::transpose
------------------
Transposes a matrix.
.. ocv:function:: void gpu::transpose( const GpuMat& src1, GpuMat& dst, Stream& stream=Stream::Null() )
:param src1: Source matrix. 1-, 4-, 8-byte element sizes are supported for now (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc).
:param dst: Destination matrix.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`transpose`
gpu::flip
-------------
Flips a 2D matrix around vertical, horizontal, or both axes.
.. ocv:function:: void gpu::flip( const GpuMat& a, GpuMat& b, int flipCode, Stream& stream=Stream::Null() )
:param a: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U``, ``CV_16U``, ``CV_32S`` or ``CV_32F`` depth.
:param b: Destination matrix.
:param flipCode: Flip mode for the source:
* ``0`` Flips around x-axis.
* ``>0`` Flips around y-axis.
* ``<0`` Flips around both axes.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`flip`
gpu::LUT
------------
Transforms the source matrix into the destination matrix using the given look-up table: ``dst(I) = lut(src(I))``
.. ocv:function:: void gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& stream = Stream::Null())
:param src: Source matrix. ``CV_8UC1`` and ``CV_8UC3`` matrices are supported for now.
:param lut: Look-up table of 256 elements. It is a continuous ``CV_8U`` matrix.
:param dst: Destination matrix with the same depth as ``lut`` and the same number of channels as ``src`` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`LUT`
gpu::merge
--------------
Makes a multi-channel matrix out of several single-channel matrices.
.. ocv:function:: void gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::merge(const vector<GpuMat>& src, GpuMat& dst, Stream& stream = Stream::Null())
:param src: Array/vector of source matrices.
:param n: Number of source matrices.
:param dst: Destination matrix.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`merge`
gpu::split
--------------
Copies each plane of a multi-channel matrix into an array.
.. ocv:function:: void gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::split(const GpuMat& src, vector<GpuMat>& dst, Stream& stream = Stream::Null())
:param src: Source matrix.
:param dst: Destination array/vector of single-channel matrices.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`split`
gpu::magnitude
------------------
Computes magnitudes of complex matrix elements.
.. ocv:function:: void gpu::magnitude( const GpuMat& xy, GpuMat& magnitude, Stream& stream=Stream::Null() )
.. ocv:function:: void gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null())
:param xy: Source complex matrix in the interleaved format ( ``CV_32FC2`` ).
:param x: Source matrix containing real components ( ``CV_32FC1`` ).
:param y: Source matrix containing imaginary components ( ``CV_32FC1`` ).
:param magnitude: Destination matrix of float magnitudes ( ``CV_32FC1`` ).
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`magnitude`
gpu::magnitudeSqr
---------------------
Computes squared magnitudes of complex matrix elements.
.. ocv:function:: void gpu::magnitudeSqr( const GpuMat& xy, GpuMat& magnitude, Stream& stream=Stream::Null() )
.. ocv:function:: void gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, Stream& stream = Stream::Null())
:param xy: Source complex matrix in the interleaved format ( ``CV_32FC2`` ).
:param x: Source matrix containing real components ( ``CV_32FC1`` ).
:param y: Source matrix containing imaginary components ( ``CV_32FC1`` ).
:param magnitude: Destination matrix of float magnitude squares ( ``CV_32FC1`` ).
:param stream: Stream for the asynchronous version.
gpu::phase
--------------
Computes polar angles of complex matrix elements.
.. ocv:function:: void gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees=false, Stream& stream = Stream::Null())
:param x: Source matrix containing real components ( ``CV_32FC1`` ).
:param y: Source matrix containing imaginary components ( ``CV_32FC1`` ).
:param angle: Destination matrix of angles ( ``CV_32FC1`` ).
:param angleInDegrees: Flag for angles that must be evaluated in degrees.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`phase`
gpu::cartToPolar
--------------------
Converts Cartesian coordinates into polar.
.. ocv:function:: void gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& magnitude, GpuMat& angle, bool angleInDegrees=false, Stream& stream = Stream::Null())
:param x: Source matrix containing real components ( ``CV_32FC1`` ).
:param y: Source matrix containing imaginary components ( ``CV_32FC1`` ).
:param magnitude: Destination matrix of float magnitudes ( ``CV_32FC1`` ).
:param angle: Destination matrix of angles ( ``CV_32FC1`` ).
:param angleInDegrees: Flag for angles that must be evaluated in degrees.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`cartToPolar`
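For illustration (synthetic ``CV_32FC1`` inputs; angles requested in degrees): ::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat x(480, 640, CV_32FC1), y(480, 640, CV_32FC1);
        x.setTo(cv::Scalar::all(3.0f)); // synthetic real components
        y.setTo(cv::Scalar::all(4.0f)); // synthetic imaginary components

        cv::gpu::GpuMat magnitude, angle;
        cv::gpu::cartToPolar(x, y, magnitude, angle, true); // angles in degrees

        return 0;
    }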
gpu::polarToCart
--------------------
Converts polar coordinates into Cartesian.
.. ocv:function:: void gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees=false, Stream& stream = Stream::Null())
:param magnitude: Source matrix containing magnitudes ( ``CV_32FC1`` ).
:param angle: Source matrix containing angles ( ``CV_32FC1`` ).
:param x: Destination matrix of real components ( ``CV_32FC1`` ).
:param y: Destination matrix of imaginary components ( ``CV_32FC1`` ).
:param angleInDegrees: Flag that indicates angles in degrees.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`polarToCart`
gpu::normalize
--------------
Normalizes the norm or value range of an array.
.. ocv:function:: void gpu::normalize(const GpuMat& src, GpuMat& dst, double alpha = 1, double beta = 0, int norm_type = NORM_L2, int dtype = -1, const GpuMat& mask = GpuMat())
.. ocv:function:: void gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf)
:param src: input array.
:param dst: output array of the same size as ``src`` .
:param alpha: norm value to normalize to or the lower range boundary in case of the range normalization.
:param beta: upper range boundary in case of the range normalization; it is not used for the norm normalization.
:param normType: normalization type (see the details below).
:param dtype: when negative, the output array has the same type as ``src``; otherwise, it has the same number of channels as ``src`` and the depth ``=CV_MAT_DEPTH(dtype)``.
:param mask: optional operation mask.
:param norm_buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
:param cvt_buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
.. seealso:: :ocv:func:`normalize`


@@ -0,0 +1,445 @@
Per-element Operations
=======================
.. highlight:: cpp
gpu::add
------------
Computes a matrix-matrix or matrix-scalar sum.
.. ocv:function:: void gpu::add( const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask=GpuMat(), int dtype=-1, Stream& stream=Stream::Null() )
.. ocv:function:: void gpu::add( const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask=GpuMat(), int dtype=-1, Stream& stream=Stream::Null() )
:param a: First source matrix.
:param b: Second source matrix to be added to ``a`` . Matrix should have the same size and type as ``a`` .
:param sc: A scalar to be added to ``a`` .
:param c: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``a`` depth.
:param mask: Optional operation mask, 8-bit single channel array, that specifies elements of the destination array to be changed.
:param dtype: Optional depth of the output array.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`add`
gpu::subtract
-----------------
Computes a matrix-matrix or matrix-scalar difference.
.. ocv:function:: void gpu::subtract( const GpuMat& a, const GpuMat& b, GpuMat& c, const GpuMat& mask=GpuMat(), int dtype=-1, Stream& stream=Stream::Null() )
.. ocv:function:: void gpu::subtract( const GpuMat& a, const Scalar& sc, GpuMat& c, const GpuMat& mask=GpuMat(), int dtype=-1, Stream& stream=Stream::Null() )
:param a: First source matrix.
:param b: Second source matrix to be subtracted from ``a`` . Matrix should have the same size and type as ``a`` .
:param sc: A scalar to be subtracted from ``a`` .
:param c: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``a`` depth.
:param mask: Optional operation mask, 8-bit single channel array, that specifies elements of the destination array to be changed.
:param dtype: Optional depth of the output array.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`subtract`
gpu::multiply
-----------------
Computes a matrix-matrix or matrix-scalar per-element product.
.. ocv:function:: void gpu::multiply( const GpuMat& a, const GpuMat& b, GpuMat& c, double scale=1, int dtype=-1, Stream& stream=Stream::Null() )
.. ocv:function:: void gpu::multiply( const GpuMat& a, const Scalar& sc, GpuMat& c, double scale=1, int dtype=-1, Stream& stream=Stream::Null() )
:param a: First source matrix.
:param b: Second source matrix to be multiplied by ``a`` elements.
:param sc: A scalar to be multiplied by ``a`` elements.
:param c: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``a`` depth.
:param scale: Optional scale factor.
:param dtype: Optional depth of the output array.
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`multiply`
gpu::divide
-----------
Computes a matrix-matrix or matrix-scalar division.
.. ocv:function:: void gpu::divide( const GpuMat& a, const GpuMat& b, GpuMat& c, double scale=1, int dtype=-1, Stream& stream=Stream::Null() )
.. ocv:function:: void gpu::divide(const GpuMat& a, const Scalar& sc, GpuMat& c, double scale = 1, int dtype = -1, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::divide( double scale, const GpuMat& b, GpuMat& c, int dtype=-1, Stream& stream=Stream::Null() )
:param a: First source matrix or a scalar.
:param b: Second source matrix. The ``a`` elements are divided by it.
:param sc: A scalar to be divided by the elements of ``a`` matrix.
:param c: Destination matrix that has the same size and number of channels as the input array(s). The depth is defined by ``dtype`` or ``a`` depth.
:param scale: Optional scale factor.
:param dtype: Optional depth of the output array.
:param stream: Stream for the asynchronous version.
This function, in contrast to :ocv:func:`divide`, uses a round-down rounding mode.
.. seealso:: :ocv:func:`divide`
gpu::addWeighted
----------------
Computes the weighted sum of two arrays.
.. ocv:function:: void gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int dtype = -1, Stream& stream = Stream::Null())
:param src1: First source array.
:param alpha: Weight for the first array elements.
:param src2: Second source array of the same size and channel number as ``src1`` .
:param beta: Weight for the second array elements.
:param dst: Destination array that has the same size and number of channels as the input arrays.
:param gamma: Scalar added to each sum.
:param dtype: Optional depth of the destination array. When both input arrays have the same depth, ``dtype`` can be set to ``-1``, which will be equivalent to ``src1.depth()``.
:param stream: Stream for the asynchronous version.
The function ``addWeighted`` calculates the weighted sum of two arrays as follows:
.. math::
\texttt{dst} (I)= \texttt{saturate} ( \texttt{src1} (I)* \texttt{alpha} + \texttt{src2} (I)* \texttt{beta} + \texttt{gamma} )
where ``I`` is a multi-dimensional index of array elements. In case of multi-channel arrays, each channel is processed independently.
.. seealso:: :ocv:func:`addWeighted`
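A minimal blending sketch (synthetic inputs; ``dtype`` is left at -1 so the output keeps the source depth): ::

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat src1(480, 640, CV_8UC1), src2(480, 640, CV_8UC1), dst;
        src1.setTo(cv::Scalar::all(100)); // synthetic input data
        src2.setTo(cv::Scalar::all(200));

        // dst = 0.7 * src1 + 0.3 * src2 + 0
        cv::gpu::addWeighted(src1, 0.7, src2, 0.3, 0.0, dst);

        return 0;
    }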
gpu::abs
------------
Computes an absolute value of each matrix element.
.. ocv:function:: void gpu::abs(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
:param src: Source matrix. Supports ``CV_16S`` and ``CV_32F`` depth.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`abs`
gpu::sqr
------------
Computes a square value of each matrix element.
.. ocv:function:: void gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
:param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
gpu::sqrt
------------
Computes a square root of each matrix element.
.. ocv:function:: void gpu::sqrt(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
:param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`sqrt`
gpu::exp
------------
Computes an exponent of each matrix element.
.. ocv:function:: void gpu::exp( const GpuMat& a, GpuMat& b, Stream& stream=Stream::Null() )
:param a: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
:param b: Destination matrix with the same size and type as ``a`` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`exp`
gpu::log
------------
Computes a natural logarithm of absolute value of each matrix element.
.. ocv:function:: void gpu::log( const GpuMat& a, GpuMat& b, Stream& stream=Stream::Null() )
:param a: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
:param b: Destination matrix with the same size and type as ``a`` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`log`
gpu::pow
------------
Raises every matrix element to a power.
.. ocv:function:: void gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null())
:param src: Source matrix. Supports all types except ``CV_64F`` depth.
:param power: Exponent of power.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
The function ``pow`` raises every element of the input matrix to ``p`` :
.. math::
\texttt{dst} (I) = \fork{\texttt{src}(I)^p}{if \texttt{p} is integer}{|\texttt{src}(I)|^p}{otherwise}
.. seealso:: :ocv:func:`pow`
gpu::absdiff
----------------
Computes per-element absolute difference of two matrices (or of a matrix and scalar).
.. ocv:function:: void gpu::absdiff( const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& stream=Stream::Null() )
.. ocv:function:: void gpu::absdiff( const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream=Stream::Null() )
:param a: First source matrix.
:param b: Second source matrix for computing the absolute difference with ``a`` .
:param s: A scalar for computing the absolute difference with ``a`` .
:param c: Destination matrix with the same size and type as ``a`` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`absdiff`
gpu::compare
----------------
Compares elements of two matrices.
.. ocv:function:: void gpu::compare( const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream=Stream::Null() )
.. ocv:function:: void gpu::compare(const GpuMat& a, Scalar sc, GpuMat& c, int cmpop, Stream& stream = Stream::Null())
:param a: First source matrix.
:param b: Second source matrix with the same size and type as ``a`` .
:param sc: A scalar to be compared with ``a`` .
:param c: Destination matrix with the same size as ``a`` and the ``CV_8UC1`` type.
:param cmpop: Flag specifying the relation between the elements to be checked:
* **CMP_EQ:** ``a(.) == b(.)``
* **CMP_GT:** ``a(.) > b(.)``
* **CMP_GE:** ``a(.) >= b(.)``
* **CMP_LT:** ``a(.) < b(.)``
* **CMP_LE:** ``a(.) <= b(.)``
* **CMP_NE:** ``a(.) != b(.)``
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`compare`
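A sketch that builds an 8-bit mask of elements where ``a`` exceeds ``b`` (assuming, as with the CPU ``compare``, that the destination holds 255 where the relation is true and 0 elsewhere): ::

    cv::gpu::GpuMat d_a(a), d_b(b), d_mask;
    cv::gpu::compare(d_a, d_b, d_mask, cv::CMP_GT);   // d_mask is CV_8UC1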
gpu::bitwise_not
--------------------
Performs a per-element bitwise inversion.
.. ocv:function:: void gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
:param src: Source matrix.
:param dst: Destination matrix with the same size and type as ``src`` .
:param mask: Optional operation mask. 8-bit single channel image.
:param stream: Stream for the asynchronous version.
gpu::bitwise_or
-------------------
Performs a per-element bitwise disjunction of two matrices or of a matrix and a scalar.
.. ocv:function:: void gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
.. ocv:function:: void gpu::bitwise_or(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
:param src1: First source matrix.
:param src2: Second source matrix with the same size and type as ``src1`` .
:param dst: Destination matrix with the same size and type as ``src1`` .
:param mask: Optional operation mask. 8-bit single channel image.
:param stream: Stream for the asynchronous version.
gpu::bitwise_and
--------------------
Performs a per-element bitwise conjunction of two matrices or of a matrix and a scalar.
.. ocv:function:: void gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
.. ocv:function:: void gpu::bitwise_and(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
:param src1: First source matrix.
:param src2: Second source matrix with the same size and type as ``src1`` .
:param dst: Destination matrix with the same size and type as ``src1`` .
:param mask: Optional operation mask. 8-bit single channel image.
:param stream: Stream for the asynchronous version.
gpu::bitwise_xor
--------------------
Performs a per-element bitwise ``exclusive or`` operation on two matrices or on a matrix and a scalar.
.. ocv:function:: void gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
.. ocv:function:: void gpu::bitwise_xor(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
:param src1: First source matrix.
:param src2: Second source matrix with the same size and type as ``src1`` .
:param dst: Destination matrix with the same size and type as ``src1`` .
:param mask: Optional operation mask. 8-bit single channel image.
:param stream: Stream for the asynchronous version.
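The four bitwise operations share the same calling pattern; a minimal sketch on two 8-bit host images ``imgA`` and ``imgB`` of the same size and type (assumed inputs): ::

    cv::gpu::GpuMat d_a(imgA), d_b(imgB), d_and, d_xor, d_not;
    cv::gpu::bitwise_and(d_a, d_b, d_and);   // per-element AND
    cv::gpu::bitwise_xor(d_a, d_b, d_xor);   // per-element XOR
    cv::gpu::bitwise_not(d_a, d_not);        // per-element inversion, no mask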
gpu::rshift
--------------------
Performs a pixel-by-pixel right shift of an image by a constant value.
.. ocv:function:: void gpu::rshift( const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream=Stream::Null() )
:param src: Source matrix. Supports images with 1, 3, or 4 channels and integer element depth.
:param sc: Constant values, one per channel.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
gpu::lshift
--------------------
Performs a pixel-by-pixel left shift of an image by a constant value.
.. ocv:function:: void gpu::lshift( const GpuMat& src, Scalar_<int> sc, GpuMat& dst, Stream& stream=Stream::Null() )
:param src: Source matrix. Supports images with 1, 3, or 4 channels and ``CV_8U`` , ``CV_16U`` , or ``CV_32S`` depth.
:param sc: Constant values, one per channel.
:param dst: Destination matrix with the same size and type as ``src`` .
:param stream: Stream for the asynchronous version.
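For instance, dividing and multiplying every channel of an 8-bit, 3-channel image by 4 with 2-bit shifts (``hostBgr`` is an assumed host image): ::

    cv::gpu::GpuMat d_src(hostBgr), d_quarter, d_times4;
    cv::gpu::rshift(d_src, cv::Scalar_<int>(2, 2, 2), d_quarter);   // each channel >> 2
    cv::gpu::lshift(d_src, cv::Scalar_<int>(2, 2, 2), d_times4);    // each channel << 2 (overflowing bits are lost for CV_8U)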
gpu::min
------------
Computes the per-element minimum of two matrices (or a matrix and a scalar).
.. ocv:function:: void gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::min(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null())
:param src1: First source matrix.
:param src2: Second source matrix or a scalar to compare ``src1`` elements with.
:param dst: Destination matrix with the same size and type as ``src1`` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`min`
gpu::max
------------
Computes the per-element maximum of two matrices (or a matrix and a scalar).
.. ocv:function:: void gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null())
.. ocv:function:: void gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null())
:param src1: First source matrix.
:param src2: Second source matrix or a scalar to compare ``src1`` elements with.
:param dst: Destination matrix with the same size and type as ``src1`` .
:param stream: Stream for the asynchronous version.
.. seealso:: :ocv:func:`max`
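Together the two functions give a simple per-element clamp; a sketch for a ``CV_32F`` matrix (``hostMat`` is an assumed host input): ::

    cv::gpu::GpuMat d_src(hostMat), d_low, d_clamped;
    cv::gpu::max(d_src, 0.0, d_low);       // lower bound: max(src, 0)
    cv::gpu::min(d_low, 1.0, d_clamped);   // upper bound: values end up clamped to [0, 1]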

1142
modules/gpu/doc/video.rst Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -40,6 +40,4 @@
//
//M*/
#include "test_precomp.hpp"
CV_GPU_TEST_MAIN("gpu")
#include "opencv2/core/cuda_devptrs.hpp"

View File

@@ -0,0 +1,43 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/core/gpumat.hpp"

View File

@@ -46,8 +46,181 @@ using namespace std;
using namespace testing;
using namespace perf;
//////////////////////////////////////////////////////////////////////
// StereoBM
typedef std::tr1::tuple<string, string> pair_string;
DEF_PARAM_TEST_1(ImagePair, pair_string);
PERF_TEST_P(ImagePair, Calib3D_StereoBM,
Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png")))
{
declare.time(300.0);
const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgLeft.empty());
const cv::Mat imgRight = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgRight.empty());
const int preset = 0;
const int ndisp = 256;
if (PERF_RUN_GPU())
{
cv::gpu::StereoBM_GPU d_bm(preset, ndisp);
const cv::gpu::GpuMat d_imgLeft(imgLeft);
const cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat dst;
TEST_CYCLE() d_bm(d_imgLeft, d_imgRight, dst);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Ptr<cv::StereoBM> bm = cv::createStereoBM(ndisp);
cv::Mat dst;
TEST_CYCLE() bm->compute(imgLeft, imgRight, dst);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// StereoBeliefPropagation
PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation,
Values(pair_string("gpu/stereobp/aloe-L.png", "gpu/stereobp/aloe-R.png")))
{
declare.time(300.0);
const cv::Mat imgLeft = readImage(GET_PARAM(0));
ASSERT_FALSE(imgLeft.empty());
const cv::Mat imgRight = readImage(GET_PARAM(1));
ASSERT_FALSE(imgRight.empty());
const int ndisp = 64;
if (PERF_RUN_GPU())
{
cv::gpu::StereoBeliefPropagation d_bp(ndisp);
const cv::gpu::GpuMat d_imgLeft(imgLeft);
const cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat dst;
TEST_CYCLE() d_bp(d_imgLeft, d_imgRight, dst);
GPU_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}
//////////////////////////////////////////////////////////////////////
// StereoConstantSpaceBP
PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP,
Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-R.png")))
{
declare.time(300.0);
const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgLeft.empty());
const cv::Mat imgRight = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgRight.empty());
const int ndisp = 128;
if (PERF_RUN_GPU())
{
cv::gpu::StereoConstantSpaceBP d_csbp(ndisp);
const cv::gpu::GpuMat d_imgLeft(imgLeft);
const cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat dst;
TEST_CYCLE() d_csbp(d_imgLeft, d_imgRight, dst);
GPU_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}
//////////////////////////////////////////////////////////////////////
// DisparityBilateralFilter
PERF_TEST_P(ImagePair, Calib3D_DisparityBilateralFilter,
Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-disp.png")))
{
const cv::Mat img = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
const cv::Mat disp = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(disp.empty());
const int ndisp = 128;
if (PERF_RUN_GPU())
{
cv::gpu::DisparityBilateralFilter d_filter(ndisp);
const cv::gpu::GpuMat d_img(img);
const cv::gpu::GpuMat d_disp(disp);
cv::gpu::GpuMat dst;
TEST_CYCLE() d_filter(d_disp, d_img, dst);
GPU_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}
//////////////////////////////////////////////////////////////////////
// TransformPoints
DEF_PARAM_TEST_1(Count, int);
PERF_TEST_P(Count, Calib3D_TransformPoints,
Values(5000, 10000, 20000))
{
const int count = GetParam();
cv::Mat src(1, count, CV_32FC3);
declare.in(src, WARMUP_RNG);
const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE() cv::gpu::transformPoints(d_src, rvec, tvec, dst);
GPU_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}
//////////////////////////////////////////////////////////////////////
// ProjectPoints
@@ -133,3 +306,66 @@ PERF_TEST_P(Count, Calib3D_SolvePnPRansac,
CPU_SANITY_CHECK(tvec, 1e-6);
}
}
//////////////////////////////////////////////////////////////////////
// ReprojectImageTo3D
PERF_TEST_P(Sz_Depth, Calib3D_ReprojectImageTo3D,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16S)))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
cv::Mat src(size, depth);
declare.in(src, WARMUP_RNG);
cv::Mat Q(4, 4, CV_32FC1);
cv::randu(Q, 0.1, 1.0);
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE() cv::gpu::reprojectImageTo3D(d_src, dst, Q);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::reprojectImageTo3D(src, dst, Q);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// DrawColorDisp
PERF_TEST_P(Sz_Depth, Calib3D_DrawColorDisp,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8U, CV_16S)))
{
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE() cv::gpu::drawColorDisp(d_src, dst, 255);
GPU_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,230 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
using namespace perf;
#define GPU_DENOISING_IMAGE_SIZES testing::Values(perf::szVGA, perf::sz720p)
//////////////////////////////////////////////////////////////////////
// BilateralFilter
DEF_PARAM_TEST(Sz_Depth_Cn_KernelSz, cv::Size, MatDepth, MatCn, int);
PERF_TEST_P(Sz_Depth_Cn_KernelSz, Denoising_BilateralFilter,
Combine(GPU_DENOISING_IMAGE_SIZES,
Values(CV_8U, CV_32F),
GPU_CHANNELS_1_3,
Values(3, 5, 9)))
{
declare.time(60.0);
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
const int kernel_size = GET_PARAM(3);
const float sigma_color = 7;
const float sigma_spatial = 5;
const int borderMode = cv::BORDER_REFLECT101;
const int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE() cv::gpu::bilateralFilter(d_src, dst, kernel_size, sigma_color, sigma_spatial, borderMode);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::bilateralFilter(src, dst, kernel_size, sigma_color, sigma_spatial, borderMode);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// nonLocalMeans
DEF_PARAM_TEST(Sz_Depth_Cn_WinSz_BlockSz, cv::Size, MatDepth, MatCn, int, int);
PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_NonLocalMeans,
Combine(GPU_DENOISING_IMAGE_SIZES,
Values<MatDepth>(CV_8U),
GPU_CHANNELS_1_3,
Values(21),
Values(5)))
{
declare.time(600.0);
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int channels = GET_PARAM(2);
const int search_widow_size = GET_PARAM(3);
const int block_size = GET_PARAM(4);
const float h = 10;
const int borderMode = cv::BORDER_REFLECT101;
const int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE() cv::gpu::nonLocalMeans(d_src, dst, h, search_widow_size, block_size, borderMode);
GPU_SANITY_CHECK(dst);
}
else
{
FAIL_NO_CPU();
}
}
//////////////////////////////////////////////////////////////////////
// fastNonLocalMeans
DEF_PARAM_TEST(Sz_Depth_Cn_WinSz_BlockSz, cv::Size, MatDepth, MatCn, int, int);
PERF_TEST_P(Sz_Depth_Cn_WinSz_BlockSz, Denoising_FastNonLocalMeans,
Combine(GPU_DENOISING_IMAGE_SIZES,
Values<MatDepth>(CV_8U),
GPU_CHANNELS_1_3,
Values(21),
Values(7)))
{
declare.time(60.0);
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int search_widow_size = GET_PARAM(2);
const int block_size = GET_PARAM(3);
const float h = 10;
const int type = CV_MAKE_TYPE(depth, 1);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::FastNonLocalMeansDenoising fnlmd;
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE() fnlmd.simpleMethod(d_src, dst, h, search_widow_size, block_size);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::fastNlMeansDenoising(src, dst, h, block_size, search_widow_size);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// fastNonLocalMeans (colored)
DEF_PARAM_TEST(Sz_Depth_WinSz_BlockSz, cv::Size, MatDepth, int, int);
PERF_TEST_P(Sz_Depth_WinSz_BlockSz, Denoising_FastNonLocalMeansColored,
Combine(GPU_DENOISING_IMAGE_SIZES,
Values<MatDepth>(CV_8U),
Values(21),
Values(7)))
{
declare.time(60.0);
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
const int search_widow_size = GET_PARAM(2);
const int block_size = GET_PARAM(3);
const float h = 10;
const int type = CV_MAKE_TYPE(depth, 3);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::FastNonLocalMeansDenoising fnlmd;
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE() fnlmd.labMethod(d_src, dst, h, h, search_widow_size, block_size);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::fastNlMeansDenoisingColored(src, dst, h, h, block_size, search_widow_size);
CPU_SANITY_CHECK(dst);
}
}

View File

@@ -0,0 +1,309 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
using namespace perf;
//////////////////////////////////////////////////////////////////////
// FAST
DEF_PARAM_TEST(Image_Threshold_NonMaxSupression, string, int, bool);
PERF_TEST_P(Image_Threshold_NonMaxSupression, Features2D_FAST,
Combine(Values<string>("gpu/perf/aloe.png"),
Values(20),
Bool()))
{
const cv::Mat img = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
const int threshold = GET_PARAM(1);
const bool nonMaxSuppersion = GET_PARAM(2);
if (PERF_RUN_GPU())
{
cv::gpu::FAST_GPU d_fast(threshold, nonMaxSuppersion, 0.5);
const cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_keypoints;
TEST_CYCLE() d_fast(d_img, cv::gpu::GpuMat(), d_keypoints);
std::vector<cv::KeyPoint> gpu_keypoints;
d_fast.downloadKeypoints(d_keypoints, gpu_keypoints);
sortKeyPoints(gpu_keypoints);
SANITY_CHECK_KEYPOINTS(gpu_keypoints);
}
else
{
std::vector<cv::KeyPoint> cpu_keypoints;
TEST_CYCLE() cv::FAST(img, cpu_keypoints, threshold, nonMaxSuppersion);
SANITY_CHECK_KEYPOINTS(cpu_keypoints);
}
}
//////////////////////////////////////////////////////////////////////
// ORB
DEF_PARAM_TEST(Image_NFeatures, string, int);
PERF_TEST_P(Image_NFeatures, Features2D_ORB,
Combine(Values<string>("gpu/perf/aloe.png"),
Values(4000)))
{
declare.time(300.0);
const cv::Mat img = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
const int nFeatures = GET_PARAM(1);
if (PERF_RUN_GPU())
{
cv::gpu::ORB_GPU d_orb(nFeatures);
const cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_keypoints, d_descriptors;
TEST_CYCLE() d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
std::vector<cv::KeyPoint> gpu_keypoints;
d_orb.downloadKeyPoints(d_keypoints, gpu_keypoints);
cv::Mat gpu_descriptors(d_descriptors);
gpu_keypoints.resize(10);
gpu_descriptors = gpu_descriptors.rowRange(0, 10);
sortKeyPoints(gpu_keypoints, gpu_descriptors);
SANITY_CHECK_KEYPOINTS(gpu_keypoints);
SANITY_CHECK(gpu_descriptors);
}
else
{
cv::ORB orb(nFeatures);
std::vector<cv::KeyPoint> cpu_keypoints;
cv::Mat cpu_descriptors;
TEST_CYCLE() orb(img, cv::noArray(), cpu_keypoints, cpu_descriptors);
SANITY_CHECK_KEYPOINTS(cpu_keypoints);
SANITY_CHECK(cpu_descriptors);
}
}
//////////////////////////////////////////////////////////////////////
// BFMatch
DEF_PARAM_TEST(DescSize_Norm, int, NormType);
PERF_TEST_P(DescSize_Norm, Features2D_BFMatch,
Combine(Values(64, 128, 256),
Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
{
declare.time(20.0);
const int desc_size = GET_PARAM(0);
const int normType = GET_PARAM(1);
const int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query(3000, desc_size, type);
declare.in(query, WARMUP_RNG);
cv::Mat train(3000, desc_size, type);
declare.in(train, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::BFMatcher_GPU d_matcher(normType);
const cv::gpu::GpuMat d_query(query);
const cv::gpu::GpuMat d_train(train);
cv::gpu::GpuMat d_trainIdx, d_distance;
TEST_CYCLE() d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
std::vector<cv::DMatch> gpu_matches;
d_matcher.matchDownload(d_trainIdx, d_distance, gpu_matches);
SANITY_CHECK_MATCHES(gpu_matches);
}
else
{
cv::BFMatcher matcher(normType);
std::vector<cv::DMatch> cpu_matches;
TEST_CYCLE() matcher.match(query, train, cpu_matches);
SANITY_CHECK_MATCHES(cpu_matches);
}
}
//////////////////////////////////////////////////////////////////////
// BFKnnMatch
static void toOneRowMatches(const std::vector< std::vector<cv::DMatch> >& src, std::vector<cv::DMatch>& dst)
{
dst.clear();
for (size_t i = 0; i < src.size(); ++i)
for (size_t j = 0; j < src[i].size(); ++j)
dst.push_back(src[i][j]);
}
DEF_PARAM_TEST(DescSize_K_Norm, int, int, NormType);
PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch,
Combine(Values(64, 128, 256),
Values(2, 3),
Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
{
declare.time(30.0);
const int desc_size = GET_PARAM(0);
const int k = GET_PARAM(1);
const int normType = GET_PARAM(2);
const int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query(3000, desc_size, type);
declare.in(query, WARMUP_RNG);
cv::Mat train(3000, desc_size, type);
declare.in(train, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::BFMatcher_GPU d_matcher(normType);
const cv::gpu::GpuMat d_query(query);
const cv::gpu::GpuMat d_train(train);
cv::gpu::GpuMat d_trainIdx, d_distance, d_allDist;
TEST_CYCLE() d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
std::vector< std::vector<cv::DMatch> > matchesTbl;
d_matcher.knnMatchDownload(d_trainIdx, d_distance, matchesTbl);
std::vector<cv::DMatch> gpu_matches;
toOneRowMatches(matchesTbl, gpu_matches);
SANITY_CHECK_MATCHES(gpu_matches);
}
else
{
cv::BFMatcher matcher(normType);
std::vector< std::vector<cv::DMatch> > matchesTbl;
TEST_CYCLE() matcher.knnMatch(query, train, matchesTbl, k);
std::vector<cv::DMatch> cpu_matches;
toOneRowMatches(matchesTbl, cpu_matches);
SANITY_CHECK_MATCHES(cpu_matches);
}
}
//////////////////////////////////////////////////////////////////////
// BFRadiusMatch
PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch,
Combine(Values(64, 128, 256),
Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2))))
{
declare.time(30.0);
const int desc_size = GET_PARAM(0);
const int normType = GET_PARAM(1);
const int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
const float maxDistance = 10000;
cv::Mat query(3000, desc_size, type);
declare.in(query, WARMUP_RNG);
cv::Mat train(3000, desc_size, type);
declare.in(train, WARMUP_RNG);
if (PERF_RUN_GPU())
{
cv::gpu::BFMatcher_GPU d_matcher(normType);
const cv::gpu::GpuMat d_query(query);
const cv::gpu::GpuMat d_train(train);
cv::gpu::GpuMat d_trainIdx, d_nMatches, d_distance;
TEST_CYCLE() d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, maxDistance);
std::vector< std::vector<cv::DMatch> > matchesTbl;
d_matcher.radiusMatchDownload(d_trainIdx, d_distance, d_nMatches, matchesTbl);
std::vector<cv::DMatch> gpu_matches;
toOneRowMatches(matchesTbl, gpu_matches);
SANITY_CHECK_MATCHES(gpu_matches);
}
else
{
cv::BFMatcher matcher(normType);
std::vector< std::vector<cv::DMatch> > matchesTbl;
TEST_CYCLE() matcher.radiusMatch(query, train, matchesTbl, maxDistance);
std::vector<cv::DMatch> cpu_matches;
toOneRowMatches(matchesTbl, cpu_matches);
SANITY_CHECK_MATCHES(cpu_matches);
}
}

View File

@@ -0,0 +1,366 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
using namespace perf;
//////////////////////////////////////////////////////////////////////
// Blur
DEF_PARAM_TEST(Sz_Type_KernelSz, cv::Size, MatType, int);
PERF_TEST_P(Sz_Type_KernelSz, Filters_Blur,
Combine(GPU_TYPICAL_MAT_SIZES,
Values(CV_8UC1, CV_8UC4),
Values(3, 5, 7)))
{
declare.time(20.0);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
const int ksize = GET_PARAM(2);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE() cv::gpu::blur(d_src, dst, cv::Size(ksize, ksize));
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::blur(src, dst, cv::Size(ksize, ksize));
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Sobel
PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
{
declare.time(20.0);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
const int ksize = GET_PARAM(2);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat d_buf;
TEST_CYCLE() cv::gpu::Sobel(d_src, dst, -1, 1, 1, d_buf, ksize);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::Sobel(src, dst, -1, 1, 1, ksize);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Scharr
PERF_TEST_P(Sz_Type, Filters_Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1)))
{
declare.time(20.0);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat d_buf;
TEST_CYCLE() cv::gpu::Scharr(d_src, dst, -1, 1, 0, d_buf);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::Scharr(src, dst, -1, 1, 0);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// GaussianBlur
PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
{
declare.time(20.0);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
const int ksize = GET_PARAM(2);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat d_buf;
TEST_CYCLE() cv::gpu::GaussianBlur(d_src, dst, cv::Size(ksize, ksize), d_buf, 0.5);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Laplacian
PERF_TEST_P(Sz_Type_KernelSz, Filters_Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 3)))
{
declare.time(20.0);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
const int ksize = GET_PARAM(2);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE() cv::gpu::Laplacian(d_src, dst, -1, ksize);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::Laplacian(src, dst, -1, ksize);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Erode
PERF_TEST_P(Sz_Type, Filters_Erode, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4)))
{
declare.time(20.0);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
const cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat d_buf;
TEST_CYCLE() cv::gpu::erode(d_src, dst, ker, d_buf);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::erode(src, dst, ker);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Dilate
PERF_TEST_P(Sz_Type, Filters_Dilate, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4)))
{
declare.time(20.0);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
const cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat d_buf;
TEST_CYCLE() cv::gpu::dilate(d_src, dst, ker, d_buf);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::dilate(src, dst, ker);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// MorphologyEx
CV_ENUM(MorphOp, MORPH_OPEN, MORPH_CLOSE, MORPH_GRADIENT, MORPH_TOPHAT, MORPH_BLACKHAT)
DEF_PARAM_TEST(Sz_Type_Op, cv::Size, MatType, MorphOp);
PERF_TEST_P(Sz_Type_Op, Filters_MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4), MorphOp::all()))
{
declare.time(20.0);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
const int morphOp = GET_PARAM(2);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
const cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
cv::gpu::GpuMat d_buf1;
cv::gpu::GpuMat d_buf2;
TEST_CYCLE() cv::gpu::morphologyEx(d_src, dst, morphOp, ker, d_buf1, d_buf2);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::morphologyEx(src, dst, morphOp, ker);
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Filter2D
PERF_TEST_P(Sz_Type_KernelSz, Filters_Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15)))
{
declare.time(20.0);
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
const int ksize = GET_PARAM(2);
cv::Mat src(size, type);
declare.in(src, WARMUP_RNG);
cv::Mat kernel(ksize, ksize, CV_32FC1);
declare.in(kernel, WARMUP_RNG);
if (PERF_RUN_GPU())
{
const cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat dst;
TEST_CYCLE() cv::gpu::filter2D(d_src, dst, -1, kernel);
GPU_SANITY_CHECK(dst);
}
else
{
cv::Mat dst;
TEST_CYCLE() cv::filter2D(src, dst, -1, kernel);
CPU_SANITY_CHECK(dst);
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -51,12 +51,21 @@
#ifndef __OPENCV_PERF_PRECOMP_HPP__
#define __OPENCV_PERF_PRECOMP_HPP__
#include <cstdio>
#include <iostream>
#include "opencv2/ts.hpp"
#include "opencv2/ts/gpu_perf.hpp"
#include "opencv2/core.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/gpu.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/objdetect.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/video.hpp"
#include "opencv2/photo.hpp"
#include "opencv2/core/gpu_private.hpp"
#ifdef GTEST_CREATE_SHARED_LIBRARY
#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,4 @@
set(PERF4AU_REQUIRED_DEPS opencv_core opencv_imgproc opencv_highgui opencv_video opencv_legacy opencv_ml opencv_ts opencv_gpufilters opencv_gpuimgproc opencv_gpuoptflow)
set(PERF4AU_REQUIRED_DEPS opencv_core opencv_imgproc opencv_highgui opencv_video opencv_legacy opencv_gpu opencv_ts)
ocv_check_dependencies(${PERF4AU_REQUIRED_DEPS})
@@ -25,3 +25,4 @@ if(WIN32)
set_target_properties(${the_target} PROPERTIES LINK_FLAGS "/NODEFAULTLIB:atlthunk.lib /NODEFAULTLIB:atlsd.lib /DEBUG")
endif()
endif()

View File

@@ -40,15 +40,18 @@
//
//M*/
#include "opencv2/ts.hpp"
#include "opencv2/ts/gpu_perf.hpp"
#include "opencv2/gpuimgproc.hpp"
#include "opencv2/gpuoptflow.hpp"
#include <cstdio>
#ifdef HAVE_CVCONFIG_H
#include "cvconfig.h"
#endif
#include "opencv2/core.hpp"
#include "opencv2/gpu.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/video.hpp"
#include "opencv2/legacy.hpp"
#include "opencv2/ts.hpp"
#include "opencv2/ts/gpu_perf.hpp"
int main(int argc, char* argv[])
{

565
modules/gpu/src/arithm.cpp Normal file
View File

@@ -0,0 +1,565 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::gemm(const GpuMat&, const GpuMat&, double, const GpuMat&, double, GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::magnitude(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::magnitudeSqr(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::magnitude(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::magnitudeSqr(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::phase(const GpuMat&, const GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&) { throw_no_cuda(); }
void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
////////////////////////////////////////////////////////////////////////
// gemm
void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
{
#ifndef HAVE_CUBLAS
(void)src1;
(void)src2;
(void)alpha;
(void)src3;
(void)beta;
(void)dst;
(void)flags;
(void)stream;
CV_Error(cv::Error::StsNotImplemented, "The library was build without CUBLAS");
#else
// CUBLAS works with column-major matrices
CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()));
if (src1.depth() == CV_64F)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
}
bool tr1 = (flags & GEMM_1_T) != 0;
bool tr2 = (flags & GEMM_2_T) != 0;
bool tr3 = (flags & GEMM_3_T) != 0;
if (src1.type() == CV_64FC2)
{
if (tr1 || tr2 || tr3)
CV_Error(cv::Error::StsNotImplemented, "transpose operation doesn't implemented for CV_64FC2 type");
}
Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size();
Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size();
Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size();
Size dstSize(src2Size.width, src1Size.height);
CV_Assert(src1Size.width == src2Size.height);
CV_Assert(src3.empty() || src3Size == dstSize);
dst.create(dstSize, src1.type());
if (beta != 0)
{
if (src3.empty())
{
if (stream)
stream.enqueueMemSet(dst, Scalar::all(0));
else
dst.setTo(Scalar::all(0));
}
else
{
if (tr3)
{
transpose(src3, dst, stream);
}
else
{
if (stream)
stream.enqueueCopy(src3, dst);
else
src3.copyTo(dst);
}
}
}
cublasHandle_t handle;
cublasSafeCall( cublasCreate_v2(&handle) );
cublasSafeCall( cublasSetStream_v2(handle, StreamAccessor::getStream(stream)) );
cublasSafeCall( cublasSetPointerMode_v2(handle, CUBLAS_POINTER_MODE_HOST) );
const float alphaf = static_cast<float>(alpha);
const float betaf = static_cast<float>(beta);
const cuComplex alphacf = make_cuComplex(alphaf, 0);
const cuComplex betacf = make_cuComplex(betaf, 0);
const cuDoubleComplex alphac = make_cuDoubleComplex(alpha, 0);
const cuDoubleComplex betac = make_cuDoubleComplex(beta, 0);
cublasOperation_t transa = tr2 ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t transb = tr1 ? CUBLAS_OP_T : CUBLAS_OP_N;
switch (src1.type())
{
case CV_32FC1:
cublasSafeCall( cublasSgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
&alphaf,
src2.ptr<float>(), static_cast<int>(src2.step / sizeof(float)),
src1.ptr<float>(), static_cast<int>(src1.step / sizeof(float)),
&betaf,
dst.ptr<float>(), static_cast<int>(dst.step / sizeof(float))) );
break;
case CV_64FC1:
cublasSafeCall( cublasDgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
&alpha,
src2.ptr<double>(), static_cast<int>(src2.step / sizeof(double)),
src1.ptr<double>(), static_cast<int>(src1.step / sizeof(double)),
&beta,
dst.ptr<double>(), static_cast<int>(dst.step / sizeof(double))) );
break;
case CV_32FC2:
cublasSafeCall( cublasCgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
&alphacf,
src2.ptr<cuComplex>(), static_cast<int>(src2.step / sizeof(cuComplex)),
src1.ptr<cuComplex>(), static_cast<int>(src1.step / sizeof(cuComplex)),
&betacf,
dst.ptr<cuComplex>(), static_cast<int>(dst.step / sizeof(cuComplex))) );
break;
case CV_64FC2:
cublasSafeCall( cublasZgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
&alphac,
src2.ptr<cuDoubleComplex>(), static_cast<int>(src2.step / sizeof(cuDoubleComplex)),
src1.ptr<cuDoubleComplex>(), static_cast<int>(src1.step / sizeof(cuDoubleComplex)),
&betac,
dst.ptr<cuDoubleComplex>(), static_cast<int>(dst.step / sizeof(cuDoubleComplex))) );
break;
}
cublasSafeCall( cublasDestroy_v2(handle) );
#endif
}
////////////////////////////////////////////////////////////////////////
// transpose
void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
{
CV_Assert(src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8);
dst.create( src.cols, src.rows, src.type() );
cudaStream_t stream = StreamAccessor::getStream(s);
if (src.elemSize() == 1)
{
NppStreamHandler h(stream);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
}
else if (src.elemSize() == 4)
{
NppStStreamHandler h(stream);
NcvSize32u sz;
sz.width = src.cols;
sz.height = src.rows;
ncvSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), static_cast<int>(src.step),
dst.ptr<Ncv32u>(), static_cast<int>(dst.step), sz) );
}
else // if (src.elemSize() == 8)
{
if (!deviceSupports(NATIVE_DOUBLE))
CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
NppStStreamHandler h(stream);
NcvSize32u sz;
sz.width = src.cols;
sz.height = src.rows;
ncvSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), static_cast<int>(src.step),
dst.ptr<Ncv64u>(), static_cast<int>(dst.step), sz) );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// flip
namespace
{
template<int DEPTH> struct NppTypeTraits;
template<> struct NppTypeTraits<CV_8U> { typedef Npp8u npp_t; };
template<> struct NppTypeTraits<CV_8S> { typedef Npp8s npp_t; };
template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; };
template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; };
template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; };
template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; };
template <int DEPTH> struct NppMirrorFunc
{
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip);
};
template <int DEPTH, typename NppMirrorFunc<DEPTH>::func_t func> struct NppMirror
{
typedef typename NppMirrorFunc<DEPTH>::npp_t npp_t;
static void call(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream)
{
NppStreamHandler h(stream);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step),
dst.ptr<npp_t>(), static_cast<int>(dst.step), sz,
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
}
void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
{
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call},
{0,0,0,0},
{NppMirror<CV_16U, nppiMirror_16u_C1R>::call, 0, NppMirror<CV_16U, nppiMirror_16u_C3R>::call, NppMirror<CV_16U, nppiMirror_16u_C4R>::call},
{0,0,0,0},
{NppMirror<CV_32S, nppiMirror_32s_C1R>::call, 0, NppMirror<CV_32S, nppiMirror_32s_C3R>::call, NppMirror<CV_32S, nppiMirror_32s_C4R>::call},
{NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
};
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F);
CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
dst.create(src.size(), src.type());
funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
// LUT
void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
{
const int cn = src.channels();
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
CV_Assert( lut.depth() == CV_8U );
CV_Assert( lut.channels() == 1 || lut.channels() == cn );
CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() );
dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn));
NppiSize sz;
sz.height = src.rows;
sz.width = src.cols;
Mat nppLut;
lut.convertTo(nppLut, CV_32S);
int nValues3[] = {256, 256, 256};
Npp32s pLevels[256];
for (int i = 0; i < 256; ++i)
pLevels[i] = i;
const Npp32s* pLevels3[3];
#if (CUDA_VERSION <= 4020)
pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
#else
GpuMat d_pLevels;
d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
#endif
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
if (src.type() == CV_8UC1)
{
#if (CUDA_VERSION <= 4020)
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
#else
GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
#endif
}
else
{
const Npp32s* pValues3[3];
Mat nppLut3[3];
if (nppLut.channels() == 1)
{
#if (CUDA_VERSION <= 4020)
pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
#else
GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
#endif
}
else
{
cv::split(nppLut, nppLut3);
#if (CUDA_VERSION <= 4020)
pValues3[0] = nppLut3[0].ptr<Npp32s>();
pValues3[1] = nppLut3[1].ptr<Npp32s>();
pValues3[2] = nppLut3[2].ptr<Npp32s>();
#else
GpuMat d_nppLut0(Mat(1, 256, CV_32S, nppLut3[0].data));
GpuMat d_nppLut1(Mat(1, 256, CV_32S, nppLut3[1].data));
GpuMat d_nppLut2(Mat(1, 256, CV_32S, nppLut3[2].data));
pValues3[0] = d_nppLut0.ptr<Npp32s>();
pValues3[1] = d_nppLut1.ptr<Npp32s>();
pValues3[2] = d_nppLut2.ptr<Npp32s>();
#endif
}
nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// NPP magnitude
namespace
{
typedef NppStatus (*nppMagnitude_t)(const Npp32fc* pSrc, int nSrcStep, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);
inline void npp_magnitude(const GpuMat& src, GpuMat& dst, nppMagnitude_t func, cudaStream_t stream)
{
CV_Assert(src.type() == CV_32FC2);
dst.create(src.size(), CV_32FC1);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<Npp32fc>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
void cv::gpu::magnitude(const GpuMat& src, GpuMat& dst, Stream& stream)
{
npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream));
}
void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
{
npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
// Polar <-> Cart
namespace cv { namespace gpu { namespace cudev
{
namespace mathfunc
{
void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
}
}}}
namespace
{
inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)
{
using namespace ::cv::gpu::cudev::mathfunc;
CV_Assert(x.size() == y.size() && x.type() == y.type());
CV_Assert(x.depth() == CV_32F);
if (mag)
mag->create(x.size(), x.type());
if (angle)
angle->create(x.size(), x.type());
GpuMat x1cn = x.reshape(1);
GpuMat y1cn = y.reshape(1);
GpuMat mag1cn = mag ? mag->reshape(1) : GpuMat();
GpuMat angle1cn = angle ? angle->reshape(1) : GpuMat();
cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream);
}
inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)
{
using namespace ::cv::gpu::cudev::mathfunc;
CV_Assert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
CV_Assert(mag.depth() == CV_32F);
x.create(mag.size(), mag.type());
y.create(mag.size(), mag.type());
GpuMat mag1cn = mag.reshape(1);
GpuMat angle1cn = angle.reshape(1);
GpuMat x1cn = x.reshape(1);
GpuMat y1cn = y.reshape(1);
polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream);
}
}
void cv::gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
{
cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream));
}
void cv::gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
{
cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream));
}
void cv::gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, Stream& stream)
{
cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
}
void cv::gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& mag, GpuMat& angle, bool angleInDegrees, Stream& stream)
{
cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
}
void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, Stream& stream)
{
polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
// normalize
void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask)
{
GpuMat norm_buf;
GpuMat cvt_buf;
normalize(src, dst, a, b, norm_type, dtype, mask, norm_buf, cvt_buf);
}
void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf)
{
double scale = 1, shift = 0;
if (norm_type == NORM_MINMAX)
{
double smin = 0, smax = 0;
double dmin = std::min(a, b), dmax = std::max(a, b);
minMax(src, &smin, &smax, mask, norm_buf);
scale = (dmax - dmin) * (smax - smin > std::numeric_limits<double>::epsilon() ? 1.0 / (smax - smin) : 0.0);
shift = dmin - smin * scale;
}
else if (norm_type == NORM_L2 || norm_type == NORM_L1 || norm_type == NORM_INF)
{
scale = norm(src, norm_type, mask, norm_buf);
scale = scale > std::numeric_limits<double>::epsilon() ? a / scale : 0.0;
shift = 0;
}
else
{
CV_Error(cv::Error::StsBadArg, "Unknown/unsupported norm type");
}
if (mask.empty())
{
src.convertTo(dst, dtype, scale, shift);
}
else
{
src.convertTo(cvt_buf, dtype, scale, shift);
cvt_buf.copyTo(dst, mask);
}
}
#endif /* !defined (HAVE_CUDA) */
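Below is a minimal host-side sketch (illustrative, not part of this revert) of how the cartToPolar and normalize wrappers above might be exercised; it assumes opencv2/gpu/gpu.hpp is included and that the public declarations carry the usual default Stream arguments.
static void example_cartToPolar_normalize()
{
    // Two CV_32F coordinate planes, as required by the asserts in cartToPolar_caller.
    cv::Mat h_x(480, 640, CV_32FC1), h_y(480, 640, CV_32FC1);
    cv::randu(h_x, -1.0f, 1.0f);
    cv::randu(h_y, -1.0f, 1.0f);
    cv::gpu::GpuMat d_x(h_x), d_y(h_y), d_mag, d_angle, d_norm;
    cv::gpu::Stream stream;
    // Magnitude and angle in one pass; angles reported in degrees here.
    cv::gpu::cartToPolar(d_x, d_y, d_mag, d_angle, true, stream);
    stream.waitForCompletion();
    // Rescale the magnitudes into [0, 1] through the NORM_MINMAX branch above.
    cv::gpu::normalize(d_mag, d_norm, 0.0, 1.0, cv::NORM_MINMAX, CV_32F, cv::gpu::GpuMat());
}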


@@ -0,0 +1,168 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
cv::gpu::GMG_GPU::GMG_GPU() { throw_no_cuda(); }
void cv::gpu::GMG_GPU::initialize(cv::Size, float, float) { throw_no_cuda(); }
void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, float, cv::gpu::Stream&) { throw_no_cuda(); }
void cv::gpu::GMG_GPU::release() {}
#else
namespace cv { namespace gpu { namespace cudev {
namespace bgfg_gmg
{
void loadConstants(int width, int height, float minVal, float maxVal, int quantizationLevels, float backgroundPrior,
float decisionThreshold, int maxFeatures, int numInitializationFrames);
template <typename SrcT>
void update_gpu(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
}
}}}
cv::gpu::GMG_GPU::GMG_GPU()
{
maxFeatures = 64;
learningRate = 0.025f;
numInitializationFrames = 120;
quantizationLevels = 16;
backgroundPrior = 0.8f;
decisionThreshold = 0.8f;
smoothingRadius = 7;
updateBackgroundModel = true;
}
void cv::gpu::GMG_GPU::initialize(cv::Size frameSize, float min, float max)
{
using namespace cv::gpu::cudev::bgfg_gmg;
CV_Assert(min < max);
CV_Assert(maxFeatures > 0);
CV_Assert(learningRate >= 0.0f && learningRate <= 1.0f);
CV_Assert(numInitializationFrames >= 1);
CV_Assert(quantizationLevels >= 1 && quantizationLevels <= 255);
CV_Assert(backgroundPrior >= 0.0f && backgroundPrior <= 1.0f);
minVal_ = min;
maxVal_ = max;
frameSize_ = frameSize;
frameNum_ = 0;
nfeatures_.create(frameSize_, CV_32SC1);
colors_.create(maxFeatures * frameSize_.height, frameSize_.width, CV_32SC1);
weights_.create(maxFeatures * frameSize_.height, frameSize_.width, CV_32FC1);
nfeatures_.setTo(cv::Scalar::all(0));
if (smoothingRadius > 0)
boxFilter_ = cv::gpu::createBoxFilter_GPU(CV_8UC1, CV_8UC1, cv::Size(smoothingRadius, smoothingRadius));
loadConstants(frameSize_.width, frameSize_.height, minVal_, maxVal_, quantizationLevels, backgroundPrior, decisionThreshold, maxFeatures, numInitializationFrames);
}
void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat& fgmask, float newLearningRate, cv::gpu::Stream& stream)
{
using namespace cv::gpu::cudev::bgfg_gmg;
typedef void (*func_t)(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{update_gpu<uchar>, 0, update_gpu<uchar3>, update_gpu<uchar4>},
{0,0,0,0},
{update_gpu<ushort>, 0, update_gpu<ushort3>, update_gpu<ushort4>},
{0,0,0,0},
{0,0,0,0},
{update_gpu<float>, 0, update_gpu<float3>, update_gpu<float4>}
};
CV_Assert(frame.depth() == CV_8U || frame.depth() == CV_16U || frame.depth() == CV_32F);
CV_Assert(frame.channels() == 1 || frame.channels() == 3 || frame.channels() == 4);
if (newLearningRate != -1.0f)
{
CV_Assert(newLearningRate >= 0.0f && newLearningRate <= 1.0f);
learningRate = newLearningRate;
}
if (frame.size() != frameSize_)
initialize(frame.size(), 0.0f, frame.depth() == CV_8U ? 255.0f : frame.depth() == CV_16U ? std::numeric_limits<ushort>::max() : 1.0f);
fgmask.create(frameSize_, CV_8UC1);
if (stream)
stream.enqueueMemSet(fgmask, cv::Scalar::all(0));
else
fgmask.setTo(cv::Scalar::all(0));
funcs[frame.depth()][frame.channels() - 1](frame, fgmask, colors_, weights_, nfeatures_, frameNum_, learningRate, updateBackgroundModel, cv::gpu::StreamAccessor::getStream(stream));
// approximate median filtering of the mask: box filter followed by a majority threshold
if (smoothingRadius > 0)
{
boxFilter_->apply(fgmask, buf_, cv::Rect(0,0,-1,-1), stream);
int minCount = (smoothingRadius * smoothingRadius + 1) / 2;
double thresh = 255.0 * minCount / (smoothingRadius * smoothingRadius);
cv::gpu::threshold(buf_, fgmask, thresh, 255.0, cv::THRESH_BINARY, stream);
}
// keep track of how many frames we have processed
++frameNum_;
}
void cv::gpu::GMG_GPU::release()
{
frameSize_ = Size();
nfeatures_.release();
colors_.release();
weights_.release();
boxFilter_.release();
buf_.release();
}
#endif
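For orientation, a hypothetical usage sketch of GMG_GPU as defined above (the input file name and parameter values are illustrative; the default learningRate/Stream arguments of operator() and the highgui VideoCapture API are assumed).
static void example_gmg_usage()
{
    cv::gpu::GMG_GPU gmg;
    gmg.numInitializationFrames = 40; // shorter warm-up than the 120-frame default set in the constructor
    gmg.smoothingRadius = 5;          // radius of the box filter used for the final mask cleanup
    cv::VideoCapture cap("input.avi"); // hypothetical input
    cv::Mat frame, fgmask;
    cv::gpu::GpuMat d_frame, d_fgmask;
    while (cap.read(frame))
    {
        d_frame.upload(frame);
        gmg(d_frame, d_fgmask, -1.0f); // -1 keeps the learningRate chosen at construction
        d_fgmask.download(fgmask);
    }
    gmg.release();
}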


@@ -0,0 +1,279 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
cv::gpu::MOG_GPU::MOG_GPU(int) { throw_no_cuda(); }
void cv::gpu::MOG_GPU::initialize(cv::Size, int) { throw_no_cuda(); }
void cv::gpu::MOG_GPU::operator()(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, float, Stream&) { throw_no_cuda(); }
void cv::gpu::MOG_GPU::getBackgroundImage(GpuMat&, Stream&) const { throw_no_cuda(); }
void cv::gpu::MOG_GPU::release() {}
cv::gpu::MOG2_GPU::MOG2_GPU(int) { throw_no_cuda(); }
void cv::gpu::MOG2_GPU::initialize(cv::Size, int) { throw_no_cuda(); }
void cv::gpu::MOG2_GPU::operator()(const GpuMat&, GpuMat&, float, Stream&) { throw_no_cuda(); }
void cv::gpu::MOG2_GPU::getBackgroundImage(GpuMat&, Stream&) const { throw_no_cuda(); }
void cv::gpu::MOG2_GPU::release() {}
#else
namespace cv { namespace gpu { namespace cudev
{
namespace mog
{
void mog_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var,
int nmixtures, float varThreshold, float learningRate, float backgroundRatio, float noiseSigma,
cudaStream_t stream);
void getBackgroundImage_gpu(int cn, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream);
void loadConstants(int nmixtures, float Tb, float TB, float Tg, float varInit, float varMin, float varMax, float tau, unsigned char shadowVal);
void mog2_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean, float alphaT, float prune, bool detectShadows, cudaStream_t stream);
void getBackgroundImage2_gpu(int cn, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream);
}
}}}
namespace mog
{
const int defaultNMixtures = 5;
const int defaultHistory = 200;
const float defaultBackgroundRatio = 0.7f;
const float defaultVarThreshold = 2.5f * 2.5f;
const float defaultNoiseSigma = 30.0f * 0.5f;
const float defaultInitialWeight = 0.05f;
}
cv::gpu::MOG_GPU::MOG_GPU(int nmixtures) :
frameSize_(0, 0), frameType_(0), nframes_(0)
{
nmixtures_ = std::min(nmixtures > 0 ? nmixtures : mog::defaultNMixtures, 8);
history = mog::defaultHistory;
varThreshold = mog::defaultVarThreshold;
backgroundRatio = mog::defaultBackgroundRatio;
noiseSigma = mog::defaultNoiseSigma;
}
void cv::gpu::MOG_GPU::initialize(cv::Size frameSize, int frameType)
{
CV_Assert(frameType == CV_8UC1 || frameType == CV_8UC3 || frameType == CV_8UC4);
frameSize_ = frameSize;
frameType_ = frameType;
int ch = CV_MAT_CN(frameType);
int work_ch = ch;
// for each gaussian mixture of each pixel bg model we store
// the mixture sort key (w/sum_of_variances), the mixture weight (w),
// the mean (nchannels values) and
// the diagonal covariance matrix (another nchannels values)
weight_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
sortKey_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
mean_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC(work_ch));
var_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC(work_ch));
weight_.setTo(cv::Scalar::all(0));
sortKey_.setTo(cv::Scalar::all(0));
mean_.setTo(cv::Scalar::all(0));
var_.setTo(cv::Scalar::all(0));
nframes_ = 0;
}
void cv::gpu::MOG_GPU::operator()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat& fgmask, float learningRate, Stream& stream)
{
using namespace cv::gpu::cudev::mog;
CV_Assert(frame.depth() == CV_8U);
int ch = frame.channels();
int work_ch = ch;
if (nframes_ == 0 || learningRate >= 1.0 || frame.size() != frameSize_ || work_ch != mean_.channels())
initialize(frame.size(), frame.type());
fgmask.create(frameSize_, CV_8UC1);
++nframes_;
learningRate = learningRate >= 0.0f && nframes_ > 1 ? learningRate : 1.0f / std::min(nframes_, history);
CV_Assert(learningRate >= 0.0f);
mog_gpu(frame, ch, fgmask, weight_, sortKey_, mean_, var_, nmixtures_,
varThreshold, learningRate, backgroundRatio, noiseSigma,
StreamAccessor::getStream(stream));
}
void cv::gpu::MOG_GPU::getBackgroundImage(GpuMat& backgroundImage, Stream& stream) const
{
using namespace cv::gpu::cudev::mog;
backgroundImage.create(frameSize_, frameType_);
getBackgroundImage_gpu(backgroundImage.channels(), weight_, mean_, backgroundImage, nmixtures_, backgroundRatio, StreamAccessor::getStream(stream));
}
void cv::gpu::MOG_GPU::release()
{
frameSize_ = Size(0, 0);
frameType_ = 0;
nframes_ = 0;
weight_.release();
sortKey_.release();
mean_.release();
var_.release();
}
/////////////////////////////////////////////////////////////////
// MOG2
namespace mog2
{
// default parameters of gaussian background detection algorithm
const int defaultHistory = 500; // Learning rate: alpha = 1/defaultHistory
const float defaultVarThreshold = 4.0f * 4.0f;
const int defaultNMixtures = 5; // maximal number of Gaussians in mixture
const float defaultBackgroundRatio = 0.9f; // threshold sum of weights for background test
const float defaultVarThresholdGen = 3.0f * 3.0f;
const float defaultVarInit = 15.0f; // initial variance for new components
const float defaultVarMax = 5.0f * defaultVarInit;
const float defaultVarMin = 4.0f;
// additional parameters
const float defaultfCT = 0.05f; // complexity reduction prior constant; 0 means no reduction of the number of components
const unsigned char defaultnShadowDetection = 127; // value used in the segmentation mask for shadows; set to 0 to disable shadow detection
const float defaultfTau = 0.5f; // Tau - shadow threshold, see the paper for explanation
}
cv::gpu::MOG2_GPU::MOG2_GPU(int nmixtures) :
frameSize_(0, 0), frameType_(0), nframes_(0)
{
nmixtures_ = nmixtures > 0 ? nmixtures : mog2::defaultNMixtures;
history = mog2::defaultHistory;
varThreshold = mog2::defaultVarThreshold;
bShadowDetection = true;
backgroundRatio = mog2::defaultBackgroundRatio;
fVarInit = mog2::defaultVarInit;
fVarMax = mog2::defaultVarMax;
fVarMin = mog2::defaultVarMin;
varThresholdGen = mog2::defaultVarThresholdGen;
fCT = mog2::defaultfCT;
nShadowDetection = mog2::defaultnShadowDetection;
fTau = mog2::defaultfTau;
}
void cv::gpu::MOG2_GPU::initialize(cv::Size frameSize, int frameType)
{
using namespace cv::gpu::cudev::mog;
CV_Assert(frameType == CV_8UC1 || frameType == CV_8UC3 || frameType == CV_8UC4);
frameSize_ = frameSize;
frameType_ = frameType;
nframes_ = 0;
int ch = CV_MAT_CN(frameType);
int work_ch = ch;
// for each gaussian mixture of each pixel bg model we store ...
// the mixture weight (w),
// the mean (nchannels values) and
// the covariance
weight_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
variance_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC1);
mean_.create(frameSize.height * nmixtures_, frameSize_.width, CV_32FC(work_ch));
//make the array for keeping track of the used modes per pixel - all zeros at start
bgmodelUsedModes_.create(frameSize_, CV_8UC1);
bgmodelUsedModes_.setTo(cv::Scalar::all(0));
loadConstants(nmixtures_, varThreshold, backgroundRatio, varThresholdGen, fVarInit, fVarMin, fVarMax, fTau, nShadowDetection);
}
void cv::gpu::MOG2_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, float learningRate, Stream& stream)
{
using namespace cv::gpu::cudev::mog;
int ch = frame.channels();
int work_ch = ch;
if (nframes_ == 0 || learningRate >= 1.0f || frame.size() != frameSize_ || work_ch != mean_.channels())
initialize(frame.size(), frame.type());
fgmask.create(frameSize_, CV_8UC1);
fgmask.setTo(cv::Scalar::all(0));
++nframes_;
learningRate = learningRate >= 0.0f && nframes_ > 1 ? learningRate : 1.0f / std::min(2 * nframes_, history);
CV_Assert(learningRate >= 0.0f);
mog2_gpu(frame, frame.channels(), fgmask, bgmodelUsedModes_, weight_, variance_, mean_, learningRate, -learningRate * fCT, bShadowDetection, StreamAccessor::getStream(stream));
}
void cv::gpu::MOG2_GPU::getBackgroundImage(GpuMat& backgroundImage, Stream& stream) const
{
using namespace cv::gpu::cudev::mog;
backgroundImage.create(frameSize_, frameType_);
getBackgroundImage2_gpu(backgroundImage.channels(), bgmodelUsedModes_, weight_, mean_, backgroundImage, StreamAccessor::getStream(stream));
}
void cv::gpu::MOG2_GPU::release()
{
frameSize_ = Size(0, 0);
frameType_ = 0;
nframes_ = 0;
weight_.release();
variance_.release();
mean_.release();
bgmodelUsedModes_.release();
}
#endif
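A short hypothetical sketch of MOG2_GPU driven by the code above (the input image is illustrative; the default Stream argument of operator() and getBackgroundImage() is assumed from the public header).
static void example_mog2_usage()
{
    cv::gpu::MOG2_GPU mog2;        // defaults above: 5 mixtures, history 500, shadow detection enabled
    mog2.bShadowDetection = false; // drop the 127-valued shadow label if a binary mask is wanted
    cv::Mat frame = cv::imread("frame.png"); // illustrative 8UC3 input
    cv::gpu::GpuMat d_frame(frame), d_fgmask, d_background;
    mog2(d_frame, d_fgmask, -1.0f); // -1 => learning rate derived from the frame counter, as above
    mog2.getBackgroundImage(d_background);
}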


@@ -0,0 +1,157 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int) { throw_no_cuda(); }
cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int, float, float, float) { throw_no_cuda(); }
void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace cudev
{
namespace disp_bilateral_filter
{
void disp_load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc);
template<typename T>
void disp_bilateral_filter(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
}
}}}
using namespace ::cv::gpu::cudev::disp_bilateral_filter;
namespace
{
const float DEFAULT_EDGE_THRESHOLD = 0.1f;
const float DEFAULT_MAX_DISC_THRESHOLD = 0.2f;
const float DEFAULT_SIGMA_RANGE = 10.0f;
inline void calc_color_weighted_table(GpuMat& table_color, float sigma_range, int len)
{
Mat cpu_table_color(1, len, CV_32F);
float* line = cpu_table_color.ptr<float>();
for(int i = 0; i < len; i++)
line[i] = static_cast<float>(std::exp(-double(i * i) / (2 * sigma_range * sigma_range)));
table_color.upload(cpu_table_color);
}
inline void calc_space_weighted_filter(GpuMat& table_space, int win_size, float dist_space)
{
int half = (win_size >> 1);
Mat cpu_table_space(half + 1, half + 1, CV_32F);
for (int y = 0; y <= half; ++y)
{
float* row = cpu_table_space.ptr<float>(y);
for (int x = 0; x <= half; ++x)
row[x] = exp(-sqrt(float(y * y) + float(x * x)) / dist_space);
}
table_space.upload(cpu_table_space);
}
template <typename T>
void disp_bilateral_filter_operator(int ndisp, int radius, int iters, float edge_threshold,float max_disc_threshold,
GpuMat& table_color, GpuMat& table_space,
const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
{
short edge_disc = std::max<short>(short(1), short(ndisp * edge_threshold + 0.5));
short max_disc = short(ndisp * max_disc_threshold + 0.5);
disp_load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);
if (&dst != &disp)
{
if (stream)
stream.enqueueCopy(disp, dst);
else
disp.copyTo(dst);
}
disp_bilateral_filter<T>(dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
}
typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
GpuMat& table_color, GpuMat& table_space,
const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream);
const bilateral_filter_operator_t operators[] =
{disp_bilateral_filter_operator<unsigned char>, 0, 0, disp_bilateral_filter_operator<short>, 0, 0, 0, 0};
}
cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radius_, int iters_)
: ndisp(ndisp_), radius(radius_), iters(iters_), edge_threshold(DEFAULT_EDGE_THRESHOLD), max_disc_threshold(DEFAULT_MAX_DISC_THRESHOLD),
sigma_range(DEFAULT_SIGMA_RANGE)
{
calc_color_weighted_table(table_color, sigma_range, 255);
calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f);
}
cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radius_, int iters_, float edge_threshold_,
float max_disc_threshold_, float sigma_range_)
: ndisp(ndisp_), radius(radius_), iters(iters_), edge_threshold(edge_threshold_), max_disc_threshold(max_disc_threshold_),
sigma_range(sigma_range_)
{
calc_color_weighted_table(table_color, sigma_range, 255);
calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f);
}
void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
{
CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters);
CV_Assert(disp.rows == img.rows && disp.cols == img.cols && (disp.type() == CV_8U || disp.type() == CV_16S) && (img.type() == CV_8UC1 || img.type() == CV_8UC3));
operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, stream);
}
#endif /* !defined (HAVE_CUDA) */
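An illustrative call sequence for DisparityBilateralFilter as implemented above; the disparity and guide images are assumed to come from a GPU stereo matcher, and the default Stream argument is assumed.
static void example_disparity_bilateral_usage()
{
    cv::gpu::DisparityBilateralFilter dbf(64 /*ndisp*/, 3 /*radius*/, 1 /*iters*/);
    cv::gpu::GpuMat d_disp;    // CV_8U or CV_16S, same size as the guide image (see the assert above)
    cv::gpu::GpuMat d_img;     // CV_8UC1 or CV_8UC3 guide image
    cv::gpu::GpuMat d_refined;
    // ... fill d_disp / d_img, e.g. from a GPU block-matching stereo pass ...
    dbf(d_disp, d_img, d_refined);
}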

modules/gpu/src/blend.cpp Normal file

@@ -0,0 +1,99 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
#else
namespace cv { namespace gpu { namespace cudev
{
namespace blend
{
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
}
}}}
using namespace ::cv::gpu::cudev::blend;
void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
GpuMat& result, Stream& stream)
{
CV_Assert(img1.size() == img2.size());
CV_Assert(img1.type() == img2.type());
CV_Assert(weights1.size() == img1.size());
CV_Assert(weights2.size() == img2.size());
CV_Assert(weights1.type() == CV_32F);
CV_Assert(weights2.type() == CV_32F);
const Size size = img1.size();
const int depth = img1.depth();
const int cn = img1.channels();
result.create(size, CV_MAKE_TYPE(depth, cn));
switch (depth)
{
case CV_8U:
if (cn != 4)
blendLinearCaller<uchar>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
else
blendLinearCaller8UC4(size.height, size.width, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
break;
case CV_32F:
blendLinearCaller<float>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
break;
default:
CV_Error(cv::Error::StsUnsupportedFormat, "bad image depth in linear blending function");
}
}
#endif
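A small hypothetical example of blendLinear above, blending two same-sized images 50/50 (host images are illustrative; the default Stream argument is assumed).
static void example_blend_linear_usage()
{
    cv::Mat img1 = cv::imread("a.png"), img2 = cv::imread("b.png"); // illustrative inputs, same size and type
    cv::gpu::GpuMat d_img1(img1), d_img2(img2), d_result;
    cv::gpu::GpuMat d_w1(img1.size(), CV_32FC1, cv::Scalar::all(0.5f));
    cv::gpu::GpuMat d_w2(img2.size(), CV_32FC1, cv::Scalar::all(0.5f));
    cv::gpu::blendLinear(d_img1, d_img2, d_w1, d_w2, d_result);
}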

File diff suppressed because it is too large.


@@ -48,7 +48,9 @@ using namespace cv::gpu;
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::projectPoints(const GpuMat&, const Mat&, const Mat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat&, Mat&, bool, int, float, int, std::vector<int>*) { throw_no_cuda(); }
#else
@@ -148,7 +150,7 @@ namespace
}
// Computes rotation/translation pairs for small subsets of the input data
class TransformHypothesesGenerator : public cv::ParallelLoopBody
class TransformHypothesesGenerator
{
public:
TransformHypothesesGenerator(const Mat& object_, const Mat& image_, const Mat& dist_coef_,
@@ -158,7 +160,7 @@ namespace
num_points(num_points_), subset_size(subset_size_), rot_matrices(rot_matrices_),
transl_vectors(transl_vectors_) {}
void operator()(const Range& range) const
void operator()(const BlockedRange& range) const
{
// Input data for generation of the current hypothesis
std::vector<int> subset_indices(subset_size);
@@ -170,7 +172,7 @@ namespace
Mat rot_mat(3, 3, CV_64F);
Mat transl_vec(1, 3, CV_64F);
for (int iter = range.start; iter < range.end; ++iter)
for (int iter = range.begin(); iter < range.end(); ++iter)
{
selectRandom(subset_size, num_points, subset_indices);
for (int i = 0; i < subset_size; ++i)
@@ -236,7 +238,7 @@ void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& cam
// Generate set of hypotheses using small subsets of the input data
TransformHypothesesGenerator body(object, image_normalized, empty_dist_coef, eye_camera_mat,
num_points, subset_size, rot_matrices, transl_vectors);
parallel_for_(Range(0, num_iters), body);
parallel_for(BlockedRange(0, num_iters), body);
// Compute scores (i.e. number of inliers) for each hypothesis
GpuMat d_object(object);
@@ -250,7 +252,7 @@ void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& cam
// Find the best hypothesis index
Point best_idx;
double best_score;
gpu::minMaxLoc(d_hypothesis_scores, NULL, &best_score, NULL, &best_idx);
minMaxLoc(d_hypothesis_scores, NULL, &best_score, NULL, &best_idx);
int num_inliers = static_cast<int>(best_score);
// Extract the best hypothesis data
@@ -288,3 +290,5 @@ void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& cam
}
#endif
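A hedged usage sketch of gpu::solvePnPRansac as restored above; the point layout (1xN, 3-channel object points and 2-channel image points) and the meaning of the trailing parameters are assumptions based on the 2.4-era header and are not shown in this hunk.
static void example_solve_pnp_ransac(const cv::Mat& object, const cv::Mat& image,
                                     const cv::Mat& camera_mat, const cv::Mat& dist_coef)
{
    cv::Mat rvec, tvec;
    std::vector<int> inliers;
    cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec,
                            false /*use extrinsic guess*/, 200 /*iterations*/,
                            8.0f /*max reprojection distance, assumed*/,
                            100 /*min inlier count, assumed*/, &inliers);
}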


@@ -41,6 +41,8 @@
//M*/
#include "precomp.hpp"
#include <vector>
#include <iostream>
#include "opencv2/objdetect/objdetect_c.h"
using namespace cv;
@@ -73,37 +75,6 @@ public:
virtual bool read(const String& classifierAsXml) = 0;
};
#ifndef HAVE_OPENCV_GPULEGACY
struct cv::gpu::CascadeClassifier_GPU::HaarCascade : cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
{
public:
HaarCascade()
{
throw_no_cuda();
}
unsigned int process(const GpuMat&, GpuMat&, float, int, bool, bool, cv::Size, cv::Size)
{
throw_no_cuda();
return 0;
}
cv::Size getClassifierCvSize() const
{
throw_no_cuda();
return cv::Size();
}
bool read(const String&)
{
throw_no_cuda();
return false;
}
};
#else
struct cv::gpu::CascadeClassifier_GPU::HaarCascade : cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
{
public:
@@ -313,8 +284,6 @@ private:
virtual ~HaarCascade(){}
};
#endif
cv::Size operator -(const cv::Size& a, const cv::Size& b)
{
return cv::Size(a.width - b.width, a.height - b.height);
@@ -508,8 +477,6 @@ private:
resuzeBuffer.create(frame, CV_8UC1);
integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);
#ifdef HAVE_OPENCV_GPULEGACY
NcvSize32u roiSize;
roiSize.width = frame.width;
roiSize.height = frame.height;
@@ -520,7 +487,6 @@ private:
Ncv32u bufSize;
ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
integralBuffer.create(1, bufSize, CV_8UC1);
#endif
candidates.create(1 , frame.width >> 1, CV_32SC4);
}
@@ -756,3 +722,240 @@ bool cv::gpu::CascadeClassifier_GPU::load(const String& filename)
}
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
#if defined (HAVE_CUDA)
struct RectConvert
{
Rect operator()(const NcvRect32u& nr) const { return Rect(nr.x, nr.y, nr.width, nr.height); }
NcvRect32u operator()(const Rect& nr) const
{
NcvRect32u rect;
rect.x = nr.x;
rect.y = nr.y;
rect.width = nr.width;
rect.height = nr.height;
return rect;
}
};
void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights)
{
std::vector<Rect> rects(hypotheses.size());
std::transform(hypotheses.begin(), hypotheses.end(), rects.begin(), RectConvert());
if (weights)
{
std::vector<int> weights_int;
weights_int.assign(weights->begin(), weights->end());
cv::groupRectangles(rects, weights_int, groupThreshold, eps);
}
else
{
cv::groupRectangles(rects, groupThreshold, eps);
}
std::transform(rects.begin(), rects.end(), hypotheses.begin(), RectConvert());
hypotheses.resize(rects.size());
}
NCVStatus loadFromXML(const String &filename,
HaarClassifierCascadeDescriptor &haar,
std::vector<HaarStage64> &haarStages,
std::vector<HaarClassifierNode128> &haarClassifierNodes,
std::vector<HaarFeature64> &haarFeatures)
{
NCVStatus ncvStat;
haar.NumStages = 0;
haar.NumClassifierRootNodes = 0;
haar.NumClassifierTotalNodes = 0;
haar.NumFeatures = 0;
haar.ClassifierSize.width = 0;
haar.ClassifierSize.height = 0;
haar.bHasStumpsOnly = true;
haar.bNeedsTiltedII = false;
Ncv32u curMaxTreeDepth;
std::vector<char> xmlFileCont;
std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;
haarStages.resize(0);
haarClassifierNodes.resize(0);
haarFeatures.resize(0);
Ptr<CvHaarClassifierCascade> oldCascade = (CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0);
if (oldCascade.empty())
{
return NCV_HAAR_XML_LOADING_EXCEPTION;
}
haar.ClassifierSize.width = oldCascade->orig_window_size.width;
haar.ClassifierSize.height = oldCascade->orig_window_size.height;
int stagesCound = oldCascade->count;
for(int s = 0; s < stagesCound; ++s) // by stages
{
HaarStage64 curStage;
curStage.setStartClassifierRootNodeOffset(static_cast<Ncv32u>(haarClassifierNodes.size()));
curStage.setStageThreshold(oldCascade->stage_classifier[s].threshold);
int treesCount = oldCascade->stage_classifier[s].count;
for(int t = 0; t < treesCount; ++t) // by trees
{
Ncv32u nodeId = 0;
CvHaarClassifier* tree = &oldCascade->stage_classifier[s].classifier[t];
int nodesCount = tree->count;
for(int n = 0; n < nodesCount; ++n) //by features
{
CvHaarFeature* feature = &tree->haar_feature[n];
HaarClassifierNode128 curNode;
curNode.setThreshold(tree->threshold[n]);
NcvBool bIsLeftNodeLeaf = false;
NcvBool bIsRightNodeLeaf = false;
HaarClassifierNodeDescriptor32 nodeLeft;
if ( tree->left[n] <= 0 )
{
Ncv32f leftVal = tree->alpha[-tree->left[n]];
ncvStat = nodeLeft.create(leftVal);
ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
bIsLeftNodeLeaf = true;
}
else
{
Ncv32u leftNodeOffset = tree->left[n];
nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1));
haar.bHasStumpsOnly = false;
}
curNode.setLeftNodeDesc(nodeLeft);
HaarClassifierNodeDescriptor32 nodeRight;
if ( tree->right[n] <= 0 )
{
Ncv32f rightVal = tree->alpha[-tree->right[n]];
ncvStat = nodeRight.create(rightVal);
ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
bIsRightNodeLeaf = true;
}
else
{
Ncv32u rightNodeOffset = tree->right[n];
nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1));
haar.bHasStumpsOnly = false;
}
curNode.setRightNodeDesc(nodeRight);
Ncv32u tiltedVal = feature->tilted;
haar.bNeedsTiltedII = (tiltedVal != 0);
Ncv32u featureId = 0;
for(int l = 0; l < CV_HAAR_FEATURE_MAX; ++l) //by rects
{
Ncv32u rectX = feature->rect[l].r.x;
Ncv32u rectY = feature->rect[l].r.y;
Ncv32u rectWidth = feature->rect[l].r.width;
Ncv32u rectHeight = feature->rect[l].r.height;
Ncv32f rectWeight = feature->rect[l].weight;
if (rectWeight == 0/* && rectX == 0 &&rectY == 0 && rectWidth == 0 && rectHeight == 0*/)
break;
HaarFeature64 curFeature;
ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height);
curFeature.setWeight(rectWeight);
ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
haarFeatures.push_back(curFeature);
featureId++;
}
HaarFeatureDescriptor32 tmpFeatureDesc;
ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, bIsLeftNodeLeaf, bIsRightNodeLeaf,
featureId, static_cast<Ncv32u>(haarFeatures.size()) - featureId);
ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
curNode.setFeatureDesc(tmpFeatureDesc);
if (!nodeId)
{
//root node
haarClassifierNodes.push_back(curNode);
curMaxTreeDepth = 1;
}
else
{
//other node
h_TmpClassifierNotRootNodes.push_back(curNode);
curMaxTreeDepth++;
}
nodeId++;
}
}
curStage.setNumClassifierRootNodes(treesCount);
haarStages.push_back(curStage);
}
//fill in cascade stats
haar.NumStages = static_cast<Ncv32u>(haarStages.size());
haar.NumClassifierRootNodes = static_cast<Ncv32u>(haarClassifierNodes.size());
haar.NumClassifierTotalNodes = static_cast<Ncv32u>(haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size());
haar.NumFeatures = static_cast<Ncv32u>(haarFeatures.size());
//merge root and leaf nodes in one classifiers array
Ncv32u offsetRoot = static_cast<Ncv32u>(haarClassifierNodes.size());
for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)
{
HaarFeatureDescriptor32 featureDesc = haarClassifierNodes[i].getFeatureDesc();
HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc();
if (!featureDesc.isLeftNodeLeaf())
{
Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
nodeLeft.create(newOffset);
}
haarClassifierNodes[i].setLeftNodeDesc(nodeLeft);
HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc();
if (!featureDesc.isRightNodeLeaf())
{
Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
nodeRight.create(newOffset);
}
haarClassifierNodes[i].setRightNodeDesc(nodeRight);
}
for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++)
{
HaarFeatureDescriptor32 featureDesc = h_TmpClassifierNotRootNodes[i].getFeatureDesc();
HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc();
if (!featureDesc.isLeftNodeLeaf())
{
Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
nodeLeft.create(newOffset);
}
h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft);
HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc();
if (!featureDesc.isRightNodeLeaf())
{
Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
nodeRight.create(newOffset);
}
h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight);
haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]);
}
return NCV_SUCCESS;
}
#endif /* HAVE_CUDA */
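For context, a hypothetical detection call built on the class patched above; the detectMultiScale overload and the Rect-packed result buffer follow the 2.4-era public header, which is not part of this hunk, so treat the signature as an assumption.
static void example_cascade_usage(const cv::Mat& gray) // 8-bit single-channel input assumed
{
    cv::gpu::CascadeClassifier_GPU cascade;
    if (!cascade.load("haarcascade_frontalface_alt.xml")) // illustrative file name
        return;
    cv::gpu::GpuMat d_gray(gray), d_objects;
    int ndetections = cascade.detectMultiScale(d_gray, d_objects);
    if (ndetections > 0)
    {
        cv::Mat h_objects;
        d_objects.colRange(0, ndetections).download(h_objects);
        const cv::Rect* faces = h_objects.ptr<cv::Rect>(); // detections packed as cv::Rect values
        (void)faces;
    }
}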

modules/gpu/src/color.cpp Normal file

File diff suppressed because it is too large.


@@ -0,0 +1,201 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/*
* NV12ToARGB color space conversion CUDA kernel
*
* This sample uses CUDA to convert a simple NV12 (YUV 4:2:0 planar)
* source into output in ARGB format
*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace cudev {
namespace video_decoding
{
__constant__ uint constAlpha = ((uint)0xff << 24);
__constant__ float constHueColorSpaceMat[9];
void loadHueCSC(float hueCSC[9])
{
cudaSafeCall( cudaMemcpyToSymbol(constHueColorSpaceMat, hueCSC, 9 * sizeof(float)) );
}
__device__ void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue)
{
float luma, chromaCb, chromaCr;
// Prepare for hue adjustment
luma = (float)yuvi[0];
chromaCb = (float)((int)yuvi[1] - 512.0f);
chromaCr = (float)((int)yuvi[2] - 512.0f);
// Convert YUV To RGB with hue adjustment
*red = (luma * constHueColorSpaceMat[0]) +
(chromaCb * constHueColorSpaceMat[1]) +
(chromaCr * constHueColorSpaceMat[2]);
*green = (luma * constHueColorSpaceMat[3]) +
(chromaCb * constHueColorSpaceMat[4]) +
(chromaCr * constHueColorSpaceMat[5]);
*blue = (luma * constHueColorSpaceMat[6]) +
(chromaCb * constHueColorSpaceMat[7]) +
(chromaCr * constHueColorSpaceMat[8]);
}
__device__ uint RGBAPACK_10bit(float red, float green, float blue, uint alpha)
{
uint ARGBpixel = 0;
// Clamp final 10 bit results
red = ::fmin(::fmax(red, 0.0f), 1023.f);
green = ::fmin(::fmax(green, 0.0f), 1023.f);
blue = ::fmin(::fmax(blue, 0.0f), 1023.f);
// Convert to 8 bit unsigned integers per color component
ARGBpixel = (((uint)blue >> 2) |
(((uint)green >> 2) << 8) |
(((uint)red >> 2) << 16) |
(uint)alpha);
return ARGBpixel;
}
// CUDA kernel for outputting the final ARGB result from NV12
#define COLOR_COMPONENT_BIT_SIZE 10
#define COLOR_COMPONENT_MASK 0x3FF
__global__ void NV12ToARGB(uchar* srcImage, size_t nSourcePitch,
uint* dstImage, size_t nDestPitch,
uint width, uint height)
{
// Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
const int x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= width || y >= height)
return;
// Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
// if we move to texture we could read 4 luminance values
uint yuv101010Pel[2];
yuv101010Pel[0] = (srcImage[y * nSourcePitch + x ]) << 2;
yuv101010Pel[1] = (srcImage[y * nSourcePitch + x + 1]) << 2;
const size_t chromaOffset = nSourcePitch * height;
const int y_chroma = y >> 1;
if (y & 1) // odd scanline ?
{
uint chromaCb = srcImage[chromaOffset + y_chroma * nSourcePitch + x ];
uint chromaCr = srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1];
if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
{
chromaCb = (chromaCb + srcImage[chromaOffset + (y_chroma + 1) * nSourcePitch + x ] + 1) >> 1;
chromaCr = (chromaCr + srcImage[chromaOffset + (y_chroma + 1) * nSourcePitch + x + 1] + 1) >> 1;
}
yuv101010Pel[0] |= (chromaCb << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
yuv101010Pel[1] |= (chromaCb << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
}
else
{
yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x ] << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x ] << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
}
// this step performs the color conversion
uint yuvi[6];
float red[2], green[2], blue[2];
yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK );
yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK);
yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK );
yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK);
yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
// YUV to RGB transformation
YUV2RGB(&yuvi[0], &red[0], &green[0], &blue[0]);
YUV2RGB(&yuvi[3], &red[1], &green[1], &blue[1]);
// Clamp the results to RGBA
const size_t dstImagePitch = nDestPitch >> 2;
dstImage[y * dstImagePitch + x ] = RGBAPACK_10bit(red[0], green[0], blue[0], constAlpha);
dstImage[y * dstImagePitch + x + 1 ] = RGBAPACK_10bit(red[1], green[1], blue[1], constAlpha);
}
void NV12ToARGB_gpu(const PtrStepb decodedFrame, PtrStepSz<uint> interopFrame, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(interopFrame.cols, 2 * block.x), divUp(interopFrame.rows, block.y));
NV12ToARGB<<<grid, block, 0, stream>>>(decodedFrame.data, decodedFrame.step, interopFrame.data, interopFrame.step,
interopFrame.cols, interopFrame.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif /* CUDA_DISABLER */
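A minimal host-side sketch of driving the kernel above; the buffers and matrix coefficients are placeholders, while loadHueCSC and NV12ToARGB_gpu are the helpers defined in this file (the implicit GpuMat-to-PtrStep conversions are assumed from the 2.4-era headers).
static void example_nv12_to_argb(cv::gpu::GpuMat& d_nv12, cv::gpu::GpuMat& d_argb)
{
    // d_nv12: CV_8UC1, (height * 3 / 2) rows x width cols - luma plane followed by interleaved chroma.
    // d_argb: width x height destination whose rows are read as packed 32-bit ARGB pixels.
    float hueCSC[9] = {0}; // replace with a real YUV->RGB matrix (BT.601/BT.709 coefficients omitted here)
    cv::gpu::cudev::video_decoding::loadHueCSC(hueCSC);
    cv::gpu::cudev::video_decoding::NV12ToARGB_gpu(d_nv12, d_argb, 0); // 0 = default stream, the helper syncs
}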

File diff suppressed because it is too large.


@@ -0,0 +1,774 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_distance.hpp"
#include "opencv2/core/cuda/datamov_utils.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace bf_match
{
///////////////////////////////////////////////////////////////////////////////
// Reduction
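// The helpers below perform a block-wide key-value reduction for one query row:
// each of the BLOCK_SIZE threads in the row publishes its best candidate
// (distance plus trainIdx, and optionally imgIdx) through shared memory, and
// reduceKeyVal keeps the pair with the smallest distance (less<float>).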
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)
{
s_distance += threadIdx.y * BLOCK_SIZE;
s_trainIdx += threadIdx.y * BLOCK_SIZE;
reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
}
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)
{
s_distance += threadIdx.y * BLOCK_SIZE;
s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_imgIdx += threadIdx.y * BLOCK_SIZE;
reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
}
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled Cached
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
__device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
{
#pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query,volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
typename Dist::value_type* s_query, typename Dist::value_type* s_train,
float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
{
for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
{
Dist dist;
#pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < train.cols)
{
T val;
ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
typename Dist::result_type distVal = dist;
const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
{
bestImgIdx = imgIdx;
bestDistance = distVal;
bestTrainIdx = trainIdx;
}
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
float myBestDistance = numeric_limits<float>::max();
int myBestTrainIdx = -1;
loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);
__syncthreads();
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
float myBestDistance = numeric_limits<float>::max();
int myBestTrainIdx = -1;
int myBestImgIdx = -1;
Mask m = mask;
for (int imgIdx = 0; imgIdx < n; ++imgIdx)
{
const PtrStepSz<T> train = trains[imgIdx];
m.next();
loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
}
__syncthreads();
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestImgIdx[queryIdx] = myBestImgIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query,volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
typename Dist::value_type* s_query, typename Dist::value_type* s_train,
float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
{
for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
{
Dist dist;
#pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
typename Dist::result_type distVal = dist;
const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
{
bestImgIdx = imgIdx;
bestDistance = distVal;
bestTrainIdx = trainIdx;
}
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
float myBestDistance = numeric_limits<float>::max();
int myBestTrainIdx = -1;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);
__syncthreads();
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
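// Multi-image unrolled kernel: loops over the n train matrices (a device array),
// advances the mask collection with m.next() per image, and keeps the best
// (train, image) pair before the three-buffer reduction.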
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
float myBestDistance = numeric_limits<float>::max();
int myBestTrainIdx = -1;
int myBestImgIdx = -1;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Mask m = mask;
for (int imgIdx = 0; imgIdx < n; ++imgIdx)
{
const PtrStepSz<T> train = trains[imgIdx];
m.next();
loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
}
__syncthreads();
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestImgIdx[queryIdx] = myBestImgIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match
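// Generic (non-unrolled) path used when the descriptor is longer than the largest
// enabled unrolled specialization: same tiling scheme, but the column loop runs to
// divUp(query.cols, BLOCK_SIZE) at run time.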
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__device__ void loop(int queryIdx, const PtrStepSz<T>& query, volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
typename Dist::value_type* s_query, typename Dist::value_type* s_train,
float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
{
for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
{
Dist dist;
for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
typename Dist::result_type distVal = dist;
const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
{
bestImgIdx = imgIdx;
bestDistance = distVal;
bestTrainIdx = trainIdx;
}
}
}
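// Single-image kernel for the generic path; as above, imgIdx is 0 and the bestImgIdx
// slot is a dummy (myBestTrainIdx is reused).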
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
float myBestDistance = numeric_limits<float>::max();
int myBestTrainIdx = -1;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);
__syncthreads();
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
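// Multi-image generic kernel: same per-image loop over trains[] with m.next() as the
// unrolled variant, followed by the findBestMatch reduction over distance, train
// index and image index.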
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
float myBestDistance = numeric_limits<float>::max();
int myBestTrainIdx = -1;
int myBestImgIdx = -1;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Mask m = mask;
for (int imgIdx = 0; imgIdx < n; ++imgIdx)
{
const PtrStepSz<T> train = trains[imgIdx];
m.next();
loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
}
__syncthreads();
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
bestTrainIdx[queryIdx] = myBestTrainIdx;
bestImgIdx[queryIdx] = myBestImgIdx;
bestDistance[queryIdx] = myBestDistance;
}
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match dispatcher
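// Picks a kernel from the descriptor length: matchUnrolledCached for up to 64 or 128
// columns (the longer unrolled variants are commented out), otherwise the generic
// match kernel, always with a 16x16 block.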
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
if (query.cols <= 64)
{
matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
}
else if (query.cols <= 128)
{
matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);
}*/
else
{
match<16, Dist>(query, train, mask, trainIdx, distance, stream);
}
}
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
if (query.cols <= 64)
{
matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}
else if (query.cols <= 128)
{
matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}*/
else
{
match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
}
}
///////////////////////////////////////////////////////////////////////////////
// Match caller
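// Exported entry points: reinterpret the untyped device matrices as PtrStepSz<T>,
// choose SingleMask / MaskCollection / WithOutMask depending on whether a mask was
// supplied, and forward to matchDispatcher for each supported distance metric.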
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
trainIdx, distance,
stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
trainIdx, distance,
stream);
}
}
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
trainIdx, distance,
stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
trainIdx, distance,
stream);
}
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
trainIdx, distance,
stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
trainIdx, distance,
stream);
}
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
if (masks.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance,
stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance,
stream);
}
}
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
if (masks.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance,
stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance,
stream);
}
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
cudaStream_t stream)
{
if (masks.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
trainIdx, imgIdx, distance,
stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
trainIdx, imgIdx, distance,
stream);
}
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
} // namespace bf_match
}}} // namespace cv { namespace gpu { namespace cudev {
#endif /* CUDA_DISABLER */


@@ -0,0 +1,463 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_distance.hpp"
#include "opencv2/core/cuda/datamov_utils.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace bf_radius_match
{
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled
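// Radius-match kernel: each thread owns one (query, train) pair of the 2D grid,
// accumulates the unrolled distance through shared-memory tiles, and appends every
// match below maxDistance to the per-query result row via atomicInc on nMatches;
// matches beyond maxCount are counted but not stored.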
template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Dist dist;
#pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
float distVal = (typename Dist::result_type)dist;
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
{
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
if (ind < maxCount)
{
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
for (int i = 0; i < n; ++i)
{
const PtrStepSz<T> train = trains[i];
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
if (masks != 0 && masks[i].data)
{
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
else
{
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match
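// Generic radius-match kernel: identical per-pair layout, with the column loop
// bounded at run time by query.cols instead of being unrolled to MAX_DESC_LEN.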
template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Dist dist;
for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
float distVal = (typename Dist::result_type)dist;
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
{
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
if (ind < maxCount)
{
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, typename Dist, typename T>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
for (int i = 0; i < n; ++i)
{
const PtrStepSz<T> train = trains[i];
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
if (masks != 0 && masks[i].data)
{
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
else
{
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match dispatcher
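// Same descriptor-length dispatch as bf_match: unrolled kernels for up to 128
// columns, the generic kernel otherwise.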
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 128)
{
matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}*/
else
{
match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
}
template <typename Dist, typename T>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 128)
{
matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}*/
else
{
match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match caller
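// Exported radius-match entry points: cast to typed matrices, pick the mask functor,
// and dispatch; the multi-image overloads receive a host array of train matrices and
// optional per-image masks, launching one kernel per image.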
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
stream);
}
}
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
stream);
}
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
stream);
}
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
stream);
}
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
stream);
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
stream);
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
} // namespace bf_radius_match
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */


@@ -0,0 +1,258 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/limits.hpp"
namespace cv { namespace gpu { namespace cudev {
namespace bgfg_gmg
{
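// GMG parameters live in constant memory and are filled once by loadConstants()
// before the update kernel runs.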
__constant__ int c_width;
__constant__ int c_height;
__constant__ float c_minVal;
__constant__ float c_maxVal;
__constant__ int c_quantizationLevels;
__constant__ float c_backgroundPrior;
__constant__ float c_decisionThreshold;
__constant__ int c_maxFeatures;
__constant__ int c_numInitializationFrames;
void loadConstants(int width, int height, float minVal, float maxVal, int quantizationLevels, float backgroundPrior,
float decisionThreshold, int maxFeatures, int numInitializationFrames)
{
cudaSafeCall( cudaMemcpyToSymbol(c_width, &width, sizeof(width)) );
cudaSafeCall( cudaMemcpyToSymbol(c_height, &height, sizeof(height)) );
cudaSafeCall( cudaMemcpyToSymbol(c_minVal, &minVal, sizeof(minVal)) );
cudaSafeCall( cudaMemcpyToSymbol(c_maxVal, &maxVal, sizeof(maxVal)) );
cudaSafeCall( cudaMemcpyToSymbol(c_quantizationLevels, &quantizationLevels, sizeof(quantizationLevels)) );
cudaSafeCall( cudaMemcpyToSymbol(c_backgroundPrior, &backgroundPrior, sizeof(backgroundPrior)) );
cudaSafeCall( cudaMemcpyToSymbol(c_decisionThreshold, &decisionThreshold, sizeof(decisionThreshold)) );
cudaSafeCall( cudaMemcpyToSymbol(c_maxFeatures, &maxFeatures, sizeof(maxFeatures)) );
cudaSafeCall( cudaMemcpyToSymbol(c_numInitializationFrames, &numInitializationFrames, sizeof(numInitializationFrames)) );
}
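// Returns the histogram weight of the quantized color at (x, y), or 0 if the color
// has not been seen; the features of a pixel are stored every c_height rows.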
__device__ float findFeature(const int color, const PtrStepi& colors, const PtrStepf& weights, const int x, const int y, const int nfeatures)
{
for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
{
if (color == colors(fy, x))
return weights(fy, x);
}
// not in histogram, so return 0.
return 0.0f;
}
__device__ void normalizeHistogram(PtrStepf weights, const int x, const int y, const int nfeatures)
{
float total = 0.0f;
for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
total += weights(fy, x);
if (total != 0.0f)
{
for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
weights(fy, x) /= total;
}
}
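// Adds (or re-weights) a color in the per-pixel histogram. When the histogram is full
// the lowest-weight feature is replaced; returns true only when a brand-new slot was
// appended, which is the caller's cue to renormalize.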
__device__ bool insertFeature(const int color, const float weight, PtrStepi colors, PtrStepf weights, const int x, const int y, int& nfeatures)
{
for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
{
if (color == colors(fy, x))
{
// feature in histogram
weights(fy, x) += weight;
return false;
}
}
if (nfeatures == c_maxFeatures)
{
// histogram full: discard the lowest-weight feature
int idx = -1;
float minVal = numeric_limits<float>::max();
for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
{
const float w = weights(fy, x);
if (w < minVal)
{
minVal = w;
idx = fy;
}
}
colors(idx, x) = color;
weights(idx, x) = weight;
return false;
}
colors(nfeatures * c_height + y, x) = color;
weights(nfeatures * c_height + y, x) = weight;
++nfeatures;
return true;
}
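// Quantization maps a pixel to an integer code over the [c_minVal, c_maxVal] range
// split into c_quantizationLevels bins: multi-channel types pack one byte-shifted
// level per channel, single-channel types use the level index directly.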
namespace detail
{
template <int cn> struct Quantization
{
template <typename T>
__device__ static int apply(const T& val)
{
int res = 0;
res |= static_cast<int>((val.x - c_minVal) * c_quantizationLevels / (c_maxVal - c_minVal));
res |= static_cast<int>((val.y - c_minVal) * c_quantizationLevels / (c_maxVal - c_minVal)) << 8;
res |= static_cast<int>((val.z - c_minVal) * c_quantizationLevels / (c_maxVal - c_minVal)) << 16;
return res;
}
};
template <> struct Quantization<1>
{
template <typename T>
__device__ static int apply(T val)
{
return static_cast<int>((val - c_minVal) * c_quantizationLevels / (c_maxVal - c_minVal));
}
};
}
template <typename T> struct Quantization : detail::Quantization<VecTraits<T>::cn> {};
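// Per-pixel GMG update: quantize the pixel, look up its histogram weight, apply Bayes'
// rule against the background prior to decide foreground, then (if updating) decay all
// weights by (1 - learningRate) and insert the new feature. During the first
// c_numInitializationFrames frames only the histogram is trained.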
template <typename SrcT>
__global__ void update(const PtrStep<SrcT> frame, PtrStepb fgmask, PtrStepi colors_, PtrStepf weights_, PtrStepi nfeatures_,
const int frameNum, const float learningRate, const bool updateBackgroundModel)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= c_width || y >= c_height)
return;
const SrcT pix = frame(y, x);
const int newFeatureColor = Quantization<SrcT>::apply(pix);
int nfeatures = nfeatures_(y, x);
if (frameNum >= c_numInitializationFrames)
{
// typical operation
const float weight = findFeature(newFeatureColor, colors_, weights_, x, y, nfeatures);
// see Godbehere, Matsukawa, Goldberg (2012) for reasoning behind this implementation of Bayes rule
const float posterior = (weight * c_backgroundPrior) / (weight * c_backgroundPrior + (1.0f - weight) * (1.0f - c_backgroundPrior));
const bool isForeground = ((1.0f - posterior) > c_decisionThreshold);
fgmask(y, x) = (uchar)(-isForeground);
// update histogram.
if (updateBackgroundModel)
{
for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
weights_(fy, x) *= 1.0f - learningRate;
bool inserted = insertFeature(newFeatureColor, learningRate, colors_, weights_, x, y, nfeatures);
if (inserted)
{
normalizeHistogram(weights_, x, y, nfeatures);
nfeatures_(y, x) = nfeatures;
}
}
}
else if (updateBackgroundModel)
{
// training-mode update
insertFeature(newFeatureColor, 1.0f, colors_, weights_, x, y, nfeatures);
if (frameNum == c_numInitializationFrames - 1)
normalizeHistogram(weights_, x, y, nfeatures);
}
}
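// Launcher: the update kernel uses no shared memory, so L1 cache is preferred;
// 32x8 blocks tile the frame.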
template <typename SrcT>
void update_gpu(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(update<SrcT>, cudaFuncCachePreferL1) );
update<SrcT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, colors, weights, nfeatures, frameNum, learningRate, updateBackgroundModel);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void update_gpu<uchar >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<uchar3 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<uchar4 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<ushort >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<ushort3>(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<ushort4>(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<float >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<float3 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<float4 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
}
}}}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,764 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/limits.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace mog
{
///////////////////////////////////////////////////////////////
// Utility
__device__ __forceinline__ float cvt(uchar val)
{
return val;
}
__device__ __forceinline__ float3 cvt(const uchar3& val)
{
return make_float3(val.x, val.y, val.z);
}
__device__ __forceinline__ float4 cvt(const uchar4& val)
{
return make_float4(val.x, val.y, val.z, val.w);
}
__device__ __forceinline__ float sqr(float val)
{
return val * val;
}
__device__ __forceinline__ float sqr(const float3& val)
{
return val.x * val.x + val.y * val.y + val.z * val.z;
}
__device__ __forceinline__ float sqr(const float4& val)
{
return val.x * val.x + val.y * val.y + val.z * val.z;
}
__device__ __forceinline__ float sum(float val)
{
return val;
}
__device__ __forceinline__ float sum(const float3& val)
{
return val.x + val.y + val.z;
}
__device__ __forceinline__ float sum(const float4& val)
{
return val.x + val.y + val.z;
}
__device__ __forceinline__ float clamp(float var, float learningRate, float diff, float minVar)
{
return ::fmaxf(var + learningRate * (diff * diff - var), minVar);
}
__device__ __forceinline__ float3 clamp(const float3& var, float learningRate, const float3& diff, float minVar)
{
return make_float3(::fmaxf(var.x + learningRate * (diff.x * diff.x - var.x), minVar),
::fmaxf(var.y + learningRate * (diff.y * diff.y - var.y), minVar),
::fmaxf(var.z + learningRate * (diff.z * diff.z - var.z), minVar));
}
__device__ __forceinline__ float4 clamp(const float4& var, float learningRate, const float4& diff, float minVar)
{
return make_float4(::fmaxf(var.x + learningRate * (diff.x * diff.x - var.x), minVar),
::fmaxf(var.y + learningRate * (diff.y * diff.y - var.y), minVar),
::fmaxf(var.z + learningRate * (diff.z * diff.z - var.z), minVar),
0.0f);
}
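// The per-mode GMM planes are stacked vertically: mode k of pixel (x, y) lives at
// row (k * rows + y), column x. swap() below exchanges modes k and k+1 of one pixel
// in such a stacked buffer.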
template <class Ptr2D>
__device__ __forceinline__ void swap(Ptr2D& ptr, int x, int y, int k, int rows)
{
typename Ptr2D::elem_type val = ptr(k * rows + y, x);
ptr(k * rows + y, x) = ptr((k + 1) * rows + y, x);
ptr((k + 1) * rows + y, x) = val;
}
///////////////////////////////////////////////////////////////
// MOG without learning
template <typename SrcT, typename WorkT>
__global__ void mog_withoutLearning(const PtrStepSz<SrcT> frame, PtrStepb fgmask,
const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, const PtrStep<WorkT> gmm_var,
const int nmixtures, const float varThreshold, const float backgroundRatio)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= frame.cols || y >= frame.rows)
return;
WorkT pix = cvt(frame(y, x));
int kHit = -1;
int kForeground = -1;
for (int k = 0; k < nmixtures; ++k)
{
if (gmm_weight(k * frame.rows + y, x) < numeric_limits<float>::epsilon())
break;
WorkT mu = gmm_mean(k * frame.rows + y, x);
WorkT var = gmm_var(k * frame.rows + y, x);
WorkT diff = pix - mu;
if (sqr(diff) < varThreshold * sum(var))
{
kHit = k;
break;
}
}
if (kHit >= 0)
{
float wsum = 0.0f;
for (int k = 0; k < nmixtures; ++k)
{
wsum += gmm_weight(k * frame.rows + y, x);
if (wsum > backgroundRatio)
{
kForeground = k + 1;
break;
}
}
}
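// -(condition) is -1 when the pixel is classified as foreground, so the uchar cast
// writes 255 for foreground and 0 for background.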
fgmask(y, x) = (uchar) (-(kHit < 0 || kHit >= kForeground));
}
template <typename SrcT, typename WorkT>
void mog_withoutLearning_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb var,
int nmixtures, float varThreshold, float backgroundRatio, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(mog_withoutLearning<SrcT, WorkT>, cudaFuncCachePreferL1) );
mog_withoutLearning<SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask,
weight, (PtrStepSz<WorkT>) mean, (PtrStepSz<WorkT>) var,
nmixtures, varThreshold, backgroundRatio);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////
// MOG with learning
template <typename SrcT, typename WorkT>
__global__ void mog_withLearning(const PtrStepSz<SrcT> frame, PtrStepb fgmask,
PtrStepf gmm_weight, PtrStepf gmm_sortKey, PtrStep<WorkT> gmm_mean, PtrStep<WorkT> gmm_var,
const int nmixtures, const float varThreshold, const float backgroundRatio, const float learningRate, const float minVar)
{
const float w0 = 0.05f;
const float sk0 = w0 / (30.0f * 0.5f * 2.0f);
const float var0 = 30.0f * 0.5f * 30.0f * 0.5f * 4.0f;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= frame.cols || y >= frame.rows)
return;
WorkT pix = cvt(frame(y, x));
float wsum = 0.0f;
int kHit = -1;
int kForeground = -1;
int k = 0;
for (; k < nmixtures; ++k)
{
float w = gmm_weight(k * frame.rows + y, x);
wsum += w;
if (w < numeric_limits<float>::epsilon())
break;
WorkT mu = gmm_mean(k * frame.rows + y, x);
WorkT var = gmm_var(k * frame.rows + y, x);
WorkT diff = pix - mu;
if (sqr(diff) < varThreshold * sum(var))
{
wsum -= w;
float dw = learningRate * (1.0f - w);
var = clamp(var, learningRate, diff, minVar);
float sortKey_prev = w / ::sqrtf(sum(var));
gmm_sortKey(k * frame.rows + y, x) = sortKey_prev;
float weight_prev = w + dw;
gmm_weight(k * frame.rows + y, x) = weight_prev;
WorkT mean_prev = mu + learningRate * diff;
gmm_mean(k * frame.rows + y, x) = mean_prev;
WorkT var_prev = var;
gmm_var(k * frame.rows + y, x) = var_prev;
int k1 = k - 1;
if (k1 >= 0)
{
float sortKey_next = gmm_sortKey(k1 * frame.rows + y, x);
float weight_next = gmm_weight(k1 * frame.rows + y, x);
WorkT mean_next = gmm_mean(k1 * frame.rows + y, x);
WorkT var_next = gmm_var(k1 * frame.rows + y, x);
for (; sortKey_next < sortKey_prev && k1 >= 0; --k1)
{
gmm_sortKey(k1 * frame.rows + y, x) = sortKey_prev;
gmm_sortKey((k1 + 1) * frame.rows + y, x) = sortKey_next;
gmm_weight(k1 * frame.rows + y, x) = weight_prev;
gmm_weight((k1 + 1) * frame.rows + y, x) = weight_next;
gmm_mean(k1 * frame.rows + y, x) = mean_prev;
gmm_mean((k1 + 1) * frame.rows + y, x) = mean_next;
gmm_var(k1 * frame.rows + y, x) = var_prev;
gmm_var((k1 + 1) * frame.rows + y, x) = var_next;
sortKey_prev = sortKey_next;
sortKey_next = k1 > 0 ? gmm_sortKey((k1 - 1) * frame.rows + y, x) : 0.0f;
weight_prev = weight_next;
weight_next = k1 > 0 ? gmm_weight((k1 - 1) * frame.rows + y, x) : 0.0f;
mean_prev = mean_next;
mean_next = k1 > 0 ? gmm_mean((k1 - 1) * frame.rows + y, x) : VecTraits<WorkT>::all(0.0f);
var_prev = var_next;
var_next = k1 > 0 ? gmm_var((k1 - 1) * frame.rows + y, x) : VecTraits<WorkT>::all(0.0f);
}
}
kHit = k1 + 1;
break;
}
}
if (kHit < 0)
{
// no appropriate Gaussian mixture found at all: remove the weakest mixture and create a new one
kHit = k = ::min(k, nmixtures - 1);
wsum += w0 - gmm_weight(k * frame.rows + y, x);
gmm_weight(k * frame.rows + y, x) = w0;
gmm_mean(k * frame.rows + y, x) = pix;
gmm_var(k * frame.rows + y, x) = VecTraits<WorkT>::all(var0);
gmm_sortKey(k * frame.rows + y, x) = sk0;
}
else
{
for( ; k < nmixtures; k++)
wsum += gmm_weight(k * frame.rows + y, x);
}
float wscale = 1.0f / wsum;
wsum = 0;
for (k = 0; k < nmixtures; ++k)
{
float w = gmm_weight(k * frame.rows + y, x);
wsum += w *= wscale;
gmm_weight(k * frame.rows + y, x) = w;
gmm_sortKey(k * frame.rows + y, x) *= wscale;
if (wsum > backgroundRatio && kForeground < 0)
kForeground = k + 1;
}
fgmask(y, x) = (uchar)(-(kHit >= kForeground));
}
template <typename SrcT, typename WorkT>
void mog_withLearning_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var,
int nmixtures, float varThreshold, float backgroundRatio, float learningRate, float minVar,
cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(mog_withLearning<SrcT, WorkT>, cudaFuncCachePreferL1) );
mog_withLearning<SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask,
weight, sortKey, (PtrStepSz<WorkT>) mean, (PtrStepSz<WorkT>) var,
nmixtures, varThreshold, backgroundRatio, learningRate, minVar);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////
// MOG
void mog_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var, int nmixtures, float varThreshold, float learningRate, float backgroundRatio, float noiseSigma, cudaStream_t stream)
{
typedef void (*withoutLearning_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb var, int nmixtures, float varThreshold, float backgroundRatio, cudaStream_t stream);
typedef void (*withLearning_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var, int nmixtures, float varThreshold, float backgroundRatio, float learningRate, float minVar, cudaStream_t stream);
static const withoutLearning_t withoutLearning[] =
{
0, mog_withoutLearning_caller<uchar, float>, 0, mog_withoutLearning_caller<uchar3, float3>, mog_withoutLearning_caller<uchar4, float4>
};
static const withLearning_t withLearning[] =
{
0, mog_withLearning_caller<uchar, float>, 0, mog_withLearning_caller<uchar3, float3>, mog_withLearning_caller<uchar4, float4>
};
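// Tables are indexed by channel count; entries 0 and 2 stay null because only
// 1-, 3- and 4-channel frames are supported.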
const float minVar = noiseSigma * noiseSigma;
if (learningRate > 0.0f)
withLearning[cn](frame, fgmask, weight, sortKey, mean, var, nmixtures, varThreshold, backgroundRatio, learningRate, minVar, stream);
else
withoutLearning[cn](frame, fgmask, weight, mean, var, nmixtures, varThreshold, backgroundRatio, stream);
}
template <typename WorkT, typename OutT>
__global__ void getBackgroundImage(const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, PtrStepSz<OutT> dst, const int nmixtures, const float backgroundRatio)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= dst.cols || y >= dst.rows)
return;
WorkT meanVal = VecTraits<WorkT>::all(0.0f);
float totalWeight = 0.0f;
for (int mode = 0; mode < nmixtures; ++mode)
{
float weight = gmm_weight(mode * dst.rows + y, x);
WorkT mean = gmm_mean(mode * dst.rows + y, x);
meanVal = meanVal + weight * mean;
totalWeight += weight;
if(totalWeight > backgroundRatio)
break;
}
meanVal = meanVal * (1.f / totalWeight);
dst(y, x) = saturate_cast<OutT>(meanVal);
}
template <typename WorkT, typename OutT>
void getBackgroundImage_caller(PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(getBackgroundImage<WorkT, OutT>, cudaFuncCachePreferL1) );
getBackgroundImage<WorkT, OutT><<<grid, block, 0, stream>>>(weight, (PtrStepSz<WorkT>) mean, (PtrStepSz<OutT>) dst, nmixtures, backgroundRatio);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void getBackgroundImage_gpu(int cn, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream);
static const func_t funcs[] =
{
0, getBackgroundImage_caller<float, uchar>, 0, getBackgroundImage_caller<float3, uchar3>, getBackgroundImage_caller<float4, uchar4>
};
funcs[cn](weight, mean, dst, nmixtures, backgroundRatio, stream);
}
///////////////////////////////////////////////////////////////
// MOG2
__constant__ int c_nmixtures;
__constant__ float c_Tb;
__constant__ float c_TB;
__constant__ float c_Tg;
__constant__ float c_varInit;
__constant__ float c_varMin;
__constant__ float c_varMax;
__constant__ float c_tau;
__constant__ unsigned char c_shadowVal;
void loadConstants(int nmixtures, float Tb, float TB, float Tg, float varInit, float varMin, float varMax, float tau, unsigned char shadowVal)
{
varMin = ::fminf(varMin, varMax);
varMax = ::fmaxf(varMin, varMax);
cudaSafeCall( cudaMemcpyToSymbol(c_nmixtures, &nmixtures, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_Tb, &Tb, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c_TB, &TB, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c_Tg, &Tg, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c_varInit, &varInit, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c_varMin, &varMin, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c_varMax, &varMax, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c_tau, &tau, sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(c_shadowVal, &shadowVal, sizeof(unsigned char)) );
}
template <bool detectShadows, typename SrcT, typename WorkT>
__global__ void mog2(const PtrStepSz<SrcT> frame, PtrStepb fgmask, PtrStepb modesUsed,
PtrStepf gmm_weight, PtrStepf gmm_variance, PtrStep<WorkT> gmm_mean,
const float alphaT, const float alpha1, const float prune)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= frame.cols || y >= frame.rows)
return;
WorkT pix = cvt(frame(y, x));
//calculate distances to the modes (+ sort)
//here we need to go in descending order!!!
bool background = false; // true - the pixel classified as background
//internal:
bool fitsPDF = false; //if it remains zero a new GMM mode will be added
int nmodes = modesUsed(y, x);
int nNewModes = nmodes; //current number of modes in GMM
float totalWeight = 0.0f;
//go through all modes
for (int mode = 0; mode < nmodes; ++mode)
{
//need only weight if fit is found
float weight = alpha1 * gmm_weight(mode * frame.rows + y, x) + prune;
//fit not found yet
if (!fitsPDF)
{
//check if it belongs to some of the remaining modes
float var = gmm_variance(mode * frame.rows + y, x);
WorkT mean = gmm_mean(mode * frame.rows + y, x);
//calculate difference and distance
WorkT diff = mean - pix;
float dist2 = sqr(diff);
//background? - Tb - usually larger than Tg
if (totalWeight < c_TB && dist2 < c_Tb * var)
background = true;
//check fit
if (dist2 < c_Tg * var)
{
//belongs to the mode
fitsPDF = true;
//update distribution
//update weight
weight += alphaT;
float k = alphaT / weight;
//update mean
gmm_mean(mode * frame.rows + y, x) = mean - k * diff;
//update variance
float varnew = var + k * (dist2 - var);
//limit the variance
varnew = ::fmaxf(varnew, c_varMin);
varnew = ::fminf(varnew, c_varMax);
gmm_variance(mode * frame.rows + y, x) = varnew;
//sort
//all other weights are at the same place and
//only the matched mode's weight is higher -> just find the new place for it
for (int i = mode; i > 0; --i)
{
//check one up
if (weight < gmm_weight((i - 1) * frame.rows + y, x))
break;
//swap one up
swap(gmm_weight, x, y, i - 1, frame.rows);
swap(gmm_variance, x, y, i - 1, frame.rows);
swap(gmm_mean, x, y, i - 1, frame.rows);
}
//belongs to the mode - bFitsPDF becomes 1
}
} // !fitsPDF
//check prune
if (weight < -prune)
{
weight = 0.0;
nmodes--;
}
gmm_weight(mode * frame.rows + y, x) = weight; //update weight by the calculated value
totalWeight += weight;
}
//renormalize weights
totalWeight = 1.f / totalWeight;
for (int mode = 0; mode < nmodes; ++mode)
gmm_weight(mode * frame.rows + y, x) *= totalWeight;
nmodes = nNewModes;
//make new mode if needed and exit
if (!fitsPDF)
{
// replace the weakest or add a new one
int mode = nmodes == c_nmixtures ? c_nmixtures - 1 : nmodes++;
if (nmodes == 1)
gmm_weight(mode * frame.rows + y, x) = 1.f;
else
{
gmm_weight(mode * frame.rows + y, x) = alphaT;
// renormalize all other weights
for (int i = 0; i < nmodes - 1; ++i)
gmm_weight(i * frame.rows + y, x) *= alpha1;
}
// init
gmm_mean(mode * frame.rows + y, x) = pix;
gmm_variance(mode * frame.rows + y, x) = c_varInit;
//sort
//find the new place for it
for (int i = nmodes - 1; i > 0; --i)
{
// check one up
if (alphaT < gmm_weight((i - 1) * frame.rows + y, x))
break;
//swap one up
swap(gmm_weight, x, y, i - 1, frame.rows);
swap(gmm_variance, x, y, i - 1, frame.rows);
swap(gmm_mean, x, y, i - 1, frame.rows);
}
}
//set the number of modes
modesUsed(y, x) = nmodes;
bool isShadow = false;
if (detectShadows && !background)
{
float tWeight = 0.0f;
// check all the components marked as background:
for (int mode = 0; mode < nmodes; ++mode)
{
WorkT mean = gmm_mean(mode * frame.rows + y, x);
WorkT pix_mean = pix * mean;
float numerator = sum(pix_mean);
float denominator = sqr(mean);
// no division by zero allowed
if (denominator == 0)
break;
// if tau < a < 1 then also check the color distortion
if (numerator <= denominator && numerator >= c_tau * denominator)
{
float a = numerator / denominator;
WorkT dD = a * mean - pix;
if (sqr(dD) < c_Tb * gmm_variance(mode * frame.rows + y, x) * a * a)
{
isShadow = true;
break;
}
};
tWeight += gmm_weight(mode * frame.rows + y, x);
if (tWeight > c_TB)
break;
}
}
fgmask(y, x) = background ? 0 : isShadow ? c_shadowVal : 255;
}
template <typename SrcT, typename WorkT>
void mog2_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean,
float alphaT, float prune, bool detectShadows, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
const float alpha1 = 1.0f - alphaT;
if (detectShadows)
{
cudaSafeCall( cudaFuncSetCacheConfig(mog2<true, SrcT, WorkT>, cudaFuncCachePreferL1) );
mog2<true, SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, modesUsed,
weight, variance, (PtrStepSz<WorkT>) mean,
alphaT, alpha1, prune);
}
else
{
cudaSafeCall( cudaFuncSetCacheConfig(mog2<false, SrcT, WorkT>, cudaFuncCachePreferL1) );
mog2<false, SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, modesUsed,
weight, variance, (PtrStepSz<WorkT>) mean,
alphaT, alpha1, prune);
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void mog2_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean,
float alphaT, float prune, bool detectShadows, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean, float alphaT, float prune, bool detectShadows, cudaStream_t stream);
static const func_t funcs[] =
{
0, mog2_caller<uchar, float>, 0, mog2_caller<uchar3, float3>, mog2_caller<uchar4, float4>
};
funcs[cn](frame, fgmask, modesUsed, weight, variance, mean, alphaT, prune, detectShadows, stream);
}
template <typename WorkT, typename OutT>
__global__ void getBackgroundImage2(const PtrStepSzb modesUsed, const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, PtrStep<OutT> dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= modesUsed.cols || y >= modesUsed.rows)
return;
int nmodes = modesUsed(y, x);
WorkT meanVal = VecTraits<WorkT>::all(0.0f);
float totalWeight = 0.0f;
for (int mode = 0; mode < nmodes; ++mode)
{
float weight = gmm_weight(mode * modesUsed.rows + y, x);
WorkT mean = gmm_mean(mode * modesUsed.rows + y, x);
meanVal = meanVal + weight * mean;
totalWeight += weight;
if(totalWeight > c_TB)
break;
}
meanVal = meanVal * (1.f / totalWeight);
dst(y, x) = saturate_cast<OutT>(meanVal);
}
template <typename WorkT, typename OutT>
void getBackgroundImage2_caller(PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(modesUsed.cols, block.x), divUp(modesUsed.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(getBackgroundImage2<WorkT, OutT>, cudaFuncCachePreferL1) );
getBackgroundImage2<WorkT, OutT><<<grid, block, 0, stream>>>(modesUsed, weight, (PtrStepSz<WorkT>) mean, (PtrStepSz<OutT>) dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void getBackgroundImage2_gpu(int cn, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream)
{
typedef void (*func_t)(PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[] =
{
0, getBackgroundImage2_caller<float, uchar>, 0, getBackgroundImage2_caller<float3, uchar3>, getBackgroundImage2_caller<float4, uchar4>
};
funcs[cn](modesUsed, weight, mean, dst, stream);
}
}
}}}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,199 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
using namespace cv::gpu;
typedef unsigned char uchar;
typedef unsigned short ushort;
//////////////////////////////////////////////////////////////////////////////////
/// Bilateral filtering
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
__device__ __forceinline__ float norm_l1(const float& a) { return ::fabs(a); }
__device__ __forceinline__ float norm_l1(const float2& a) { return ::fabs(a.x) + ::fabs(a.y); }
__device__ __forceinline__ float norm_l1(const float3& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z); }
__device__ __forceinline__ float norm_l1(const float4& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z) + ::fabs(a.w); }
__device__ __forceinline__ float sqr(const float& a) { return a * a; }
template<typename T, typename B>
__global__ void bilateral_kernel(const PtrStepSz<T> src, PtrStep<T> dst, const B b, const int ksz, const float sigma_spatial2_inv_half, const float sigma_color2_inv_half)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x >= src.cols || y >= src.rows)
return;
value_type center = saturate_cast<value_type>(src(y, x));
value_type sum1 = VecTraits<value_type>::all(0);
float sum2 = 0;
int r = ksz / 2;
float r2 = (float)(r * r);
int tx = x - r + ksz;
int ty = y - r + ksz;
if (x - ksz/2 >=0 && y - ksz/2 >=0 && tx < src.cols && ty < src.rows)
{
for (int cy = y - r; cy < ty; ++cy)
for (int cx = x - r; cx < tx; ++cx)
{
float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
if (space2 > r2)
continue;
value_type value = saturate_cast<value_type>(src(cy, cx));
float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
sum1 = sum1 + weight * value;
sum2 = sum2 + weight;
}
}
else
{
for (int cy = y - r; cy < ty; ++cy)
for (int cx = x - r; cx < tx; ++cx)
{
float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
if (space2 > r2)
continue;
value_type value = saturate_cast<value_type>(b.at(cy, cx, src.data, src.step));
float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
sum1 = sum1 + weight * value;
sum2 = sum2 + weight;
}
}
dst(y, x) = saturate_cast<T>(sum1 / sum2);
}
template<typename T, template <typename> class B>
void bilateral_caller(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream)
{
dim3 block (32, 8);
dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y));
B<T> b(src.rows, src.cols);
float sigma_spatial2_inv_half = -0.5f/(sigma_spatial * sigma_spatial);
float sigma_color2_inv_half = -0.5f/(sigma_color * sigma_color);
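// With these -0.5 / sigma^2 factors the kernel's
// exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half)
// is the usual bilateral weight exp(-d^2 / (2*sigma_spatial^2) - |dc|^2 / (2*sigma_color^2)).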
cudaSafeCall( cudaFuncSetCacheConfig (bilateral_kernel<T, B<T> >, cudaFuncCachePreferL1) );
bilateral_kernel<<<grid, block>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, kernel_size, sigma_spatial2_inv_half, sigma_color2_inv_half);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template<typename T>
void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float gauss_spatial_coeff, float gauss_color_coeff, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream);
static caller_t funcs[] =
{
bilateral_caller<T, BrdReflect101>,
bilateral_caller<T, BrdReplicate>,
bilateral_caller<T, BrdConstant>,
bilateral_caller<T, BrdReflect>,
bilateral_caller<T, BrdWrap>,
};
funcs[borderMode](src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream);
}
}
}}}
#define OCV_INSTANTIATE_BILATERAL_FILTER(T) \
template void cv::gpu::cudev::imgproc::bilateral_filter_gpu<T>(const PtrStepSzb&, PtrStepSzb, int, float, float, int, cudaStream_t);
OCV_INSTANTIATE_BILATERAL_FILTER(uchar)
//OCV_INSTANTIATE_BILATERAL_FILTER(uchar2)
OCV_INSTANTIATE_BILATERAL_FILTER(uchar3)
OCV_INSTANTIATE_BILATERAL_FILTER(uchar4)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar2)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar3)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar4)
OCV_INSTANTIATE_BILATERAL_FILTER(short)
//OCV_INSTANTIATE_BILATERAL_FILTER(short2)
OCV_INSTANTIATE_BILATERAL_FILTER(short3)
OCV_INSTANTIATE_BILATERAL_FILTER(short4)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort)
//OCV_INSTANTIATE_BILATERAL_FILTER(ushort2)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort3)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort4)
//OCV_INSTANTIATE_BILATERAL_FILTER(int)
//OCV_INSTANTIATE_BILATERAL_FILTER(int2)
//OCV_INSTANTIATE_BILATERAL_FILTER(int3)
//OCV_INSTANTIATE_BILATERAL_FILTER(int4)
OCV_INSTANTIATE_BILATERAL_FILTER(float)
//OCV_INSTANTIATE_BILATERAL_FILTER(float2)
OCV_INSTANTIATE_BILATERAL_FILTER(float3)
OCV_INSTANTIATE_BILATERAL_FILTER(float4)
#endif /* CUDA_DISABLER */


@@ -0,0 +1,121 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace blend
{
template <typename T>
__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < rows && x < cols)
{
int x_ = x / cn;
float w1 = weights1.ptr(y)[x_];
float w2 = weights2.ptr(y)[x_];
T p1 = img1.ptr(y)[x];
T p2 = img2.ptr(y)[x];
result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
}
}
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));
blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < rows && x < cols)
{
float w1 = weights1.ptr(y)[x];
float w2 = weights2.ptr(y)[x];
float sum_inv = 1.f / (w1 + w2 + 1e-5f);
w1 *= sum_inv;
w2 *= sum_inv;
uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];
uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];
((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,
p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);
}
}
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
} // namespace blend
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
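For reference, and as an illustration only (not part of the module): both kernels above compute the same per-pixel weighted average, the 8UC4 path just pre-normalizes the weights and vectorizes the loads. A small standalone check of that formula, with illustrative values:

#include <cstdio>

// One pixel of blendLinear: weighted average of the two sources, with a small
// epsilon so pixels whose weights are both zero do not divide by zero.
static float blend_pixel(float p1, float p2, float w1, float w2)
{
    return (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
}

int main()
{
    std::printf("%.2f\n", blend_pixel(10.f, 200.f, 0.25f, 0.75f)); // ~152.50
    return 0;
}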


@@ -0,0 +1,494 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <utility>
#include <algorithm>//std::swap
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/utility.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace canny
{
struct L1 : binary_function<int, int, float>
{
__device__ __forceinline__ float operator ()(int x, int y) const
{
return ::abs(x) + ::abs(y);
}
__device__ __forceinline__ L1() {}
__device__ __forceinline__ L1(const L1&) {}
};
struct L2 : binary_function<int, int, float>
{
__device__ __forceinline__ float operator ()(int x, int y) const
{
return ::sqrtf(x * x + y * y);
}
__device__ __forceinline__ L2() {}
__device__ __forceinline__ L2(const L2&) {}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
{
enum { smart_shift = 4 };
};
template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
{
enum { smart_shift = 4 };
};
}}}
namespace canny
{
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
struct SrcTex
{
const int xoff;
const int yoff;
__host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
__device__ __forceinline__ int operator ()(int y, int x) const
{
return tex2D(tex_src, x + xoff, y + yoff);
}
};
template <class Norm> __global__
void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y >= mag.rows || x >= mag.cols)
return;
int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
dx(y, x) = dxVal;
dy(y, x) = dyVal;
mag(y, x) = norm(dxVal, dyVal);
}
void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
{
const dim3 block(16, 16);
const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
bindTexture(&tex_src, srcWhole);
SrcTex src(xoff, yoff);
if (L2Grad)
{
L2 norm;
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
}
else
{
L1 norm;
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
{
if (L2Grad)
{
L2 norm;
transform(dx, dy, mag, norm, WithOutMask(), 0);
}
else
{
L1 norm;
transform(dx, dy, mag, norm, WithOutMask(), 0);
}
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
{
const int CANNY_SHIFT = 15;
const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
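// TG22 is tan(22.5 deg) in Q15 fixed point; tg67x below corresponds to
// tan(67.5 deg) = tan(22.5 deg) + 2, so the three branches pick the horizontal,
// vertical or diagonal neighbours for non-maximum suppression.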
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
return;
int dxVal = dx(y, x);
int dyVal = dy(y, x);
const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
const float m = tex2D(tex_mag, x, y);
dxVal = ::abs(dxVal);
dyVal = ::abs(dyVal);
// 0 - the pixel can not belong to an edge
// 1 - the pixel might belong to an edge
// 2 - the pixel does belong to an edge
int edge_type = 0;
if (m > low_thresh)
{
const int tg22x = dxVal * TG22;
const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
dyVal <<= CANNY_SHIFT;
if (dyVal < tg22x)
{
if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
edge_type = 1 + (int)(m > high_thresh);
}
else if(dyVal > tg67x)
{
if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
else
{
if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
}
map(y, x) = edge_type;
}
void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
bindTexture(&tex_mag, mag);
calcMapKernel<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
__device__ int counter = 0;
__global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
{
__shared__ volatile int smem[18][18];
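// 16x16 block plus a one-pixel apron on every side, hence the 18x18 shared tile.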
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
if (threadIdx.y == 0)
smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
if (threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
if (threadIdx.x == 0)
smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1)
smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == 0)
smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
__syncthreads();
if (x >= map.cols || y >= map.rows)
return;
int n;
#pragma unroll
for (int k = 0; k < 16; ++k)
{
n = 0;
if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
{
n += smem[threadIdx.y ][threadIdx.x ] == 2;
n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
}
if (n > 0)
smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
}
const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
map(y, x) = e;
n = 0;
if (e == 2)
{
n += smem[threadIdx.y ][threadIdx.x ] == 1;
n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
}
if (n > 0)
{
const int ind = ::atomicAdd(&counter, 1);
st[ind] = make_ushort2(x, y);
}
}
void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
const dim3 block(16, 16);
const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
edgesHysteresisLocalKernel<<<grid, block>>>(map, st1);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
__global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
{
const int stack_size = 512;
__shared__ int s_counter;
__shared__ int s_ind;
__shared__ ushort2 s_st[stack_size];
if (threadIdx.x == 0)
s_counter = 0;
__syncthreads();
int ind = blockIdx.y * gridDim.x + blockIdx.x;
if (ind >= count)
return;
ushort2 pos = st1[ind];
if (threadIdx.x < 8)
{
pos.x += c_dx[threadIdx.x];
pos.y += c_dy[threadIdx.x];
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{
map(pos.y, pos.x) = 2;
ind = Emulation::smem::atomicAdd(&s_counter, 1);
s_st[ind] = pos;
}
}
__syncthreads();
while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
{
const int subTaskIdx = threadIdx.x >> 3;
const int portion = ::min(s_counter, blockDim.x >> 3);
if (subTaskIdx < portion)
pos = s_st[s_counter - 1 - subTaskIdx];
__syncthreads();
if (threadIdx.x == 0)
s_counter -= portion;
__syncthreads();
if (subTaskIdx < portion)
{
pos.x += c_dx[threadIdx.x & 7];
pos.y += c_dy[threadIdx.x & 7];
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{
map(pos.y, pos.x) = 2;
ind = Emulation::smem::atomicAdd(&s_counter, 1);
s_st[ind] = pos;
}
}
__syncthreads();
}
if (s_counter > 0)
{
if (threadIdx.x == 0)
{
ind = ::atomicAdd(&counter, s_counter);
s_ind = ind - s_counter;
}
__syncthreads();
ind = s_ind;
for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
st2[ind + i] = s_st[i];
}
}
void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
while (count > 0)
{
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
const dim3 block(128);
const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
std::swap(st1, st2);
}
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
struct GetEdges : unary_function<int, uchar>
{
__device__ __forceinline__ uchar operator ()(int e) const
{
return (uchar)(-(e >> 1));
}
__device__ __forceinline__ GetEdges() {}
__device__ __forceinline__ GetEdges(const GetEdges&) {}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
{
enum { smart_shift = 4 };
};
}}}
namespace canny
{
void getEdges(PtrStepSzi map, PtrStepSzb dst)
{
transform(map, dst, GetEdges(), WithOutMask(), 0);
}
}
#endif /* CUDA_DISABLER */
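The thresholding logic in calcMapKernel is easiest to see in isolation: a pixel that survives non-maximum suppression is marked 1 (weak) or 2 (strong), and only the 2s, plus any 1s promoted to 2 during the hysteresis passes, end up as 255 in getEdges. A standalone sketch of that classification; the function name and the values are illustrative only:

#include <cstdio>

// Classify a gradient magnitude the way calcMapKernel does once the pixel has
// passed non-maximum suppression: 0 = suppressed, 1 = weak edge, 2 = strong edge.
static int classify(float m, float low_thresh, float high_thresh, bool isLocalMax)
{
    int edge_type = 0;
    if (m > low_thresh && isLocalMax)
        edge_type = 1 + (int)(m > high_thresh);
    return edge_type;
}

int main()
{
    std::printf("%d %d %d\n",
                classify( 10.f, 30.f, 90.f, true),   // 0: below the low threshold
                classify( 50.f, 30.f, 90.f, true),   // 1: weak, kept only if linked to a strong edge
                classify(120.f, 30.f, 90.f, true));  // 2: strong edge
    return 0;
}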


@@ -0,0 +1,186 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/scan.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace clahe
{
__global__ void calcLutKernel(const PtrStepb src, PtrStepb lut,
const int2 tileSize, const int tilesX,
const int clipLimit, const float lutScale)
{
__shared__ int smem[512];
const int tx = blockIdx.x;
const int ty = blockIdx.y;
const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
smem[tid] = 0;
__syncthreads();
for (int i = threadIdx.y; i < tileSize.y; i += blockDim.y)
{
const uchar* srcPtr = src.ptr(ty * tileSize.y + i) + tx * tileSize.x;
for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
{
const int data = srcPtr[j];
Emulation::smem::atomicAdd(&smem[data], 1);
}
}
__syncthreads();
int tHistVal = smem[tid];
__syncthreads();
if (clipLimit > 0)
{
// clip histogram bar
int clipped = 0;
if (tHistVal > clipLimit)
{
clipped = tHistVal - clipLimit;
tHistVal = clipLimit;
}
// find number of overall clipped samples
reduce<256>(smem, clipped, tid, plus<int>());
// broadcast evaluated value
__shared__ int totalClipped;
if (tid == 0)
totalClipped = clipped;
__syncthreads();
// redistribute clipped samples evenly
int redistBatch = totalClipped / 256;
tHistVal += redistBatch;
int residual = totalClipped - redistBatch * 256;
if (tid < residual)
++tHistVal;
}
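// the LUT entry is the scaled inclusive prefix sum (CDF) of the clipped tile histogram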
const int lutVal = blockScanInclusive<256>(tHistVal, smem, tid);
lut(ty * tilesX + tx, tid) = saturate_cast<uchar>(__float2int_rn(lutScale * lutVal));
}
void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(tilesX, tilesY);
calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void tranformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= src.cols || y >= src.rows)
return;
const float tyf = (static_cast<float>(y) / tileSize.y) - 0.5f;
int ty1 = __float2int_rd(tyf);
int ty2 = ty1 + 1;
const float ya = tyf - ty1;
ty1 = ::max(ty1, 0);
ty2 = ::min(ty2, tilesY - 1);
const float txf = (static_cast<float>(x) / tileSize.x) - 0.5f;
int tx1 = __float2int_rd(txf);
int tx2 = tx1 + 1;
const float xa = txf - tx1;
tx1 = ::max(tx1, 0);
tx2 = ::min(tx2, tilesX - 1);
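// bilinearly interpolate the equalized value from the LUTs of the four neighbouring tiles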
const int srcVal = src(y, x);
float res = 0;
res += lut(ty1 * tilesX + tx1, srcVal) * ((1.0f - xa) * (1.0f - ya));
res += lut(ty1 * tilesX + tx2, srcVal) * ((xa) * (1.0f - ya));
res += lut(ty2 * tilesX + tx1, srcVal) * ((1.0f - xa) * (ya));
res += lut(ty2 * tilesX + tx2, srcVal) * ((xa) * (ya));
dst(y, x) = saturate_cast<uchar>(res);
}
void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(tranformKernel, cudaFuncCachePreferL1) );
tranformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#endif // CUDA_DISABLER


@@ -0,0 +1,461 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/color.hpp"
#include "cvt_color_internal.h"
namespace cv { namespace gpu { namespace cudev
{
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_x = 8 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) \
{ \
traits::functor_type functor = traits::create_functor(); \
typedef typename traits::functor_type::argument_type src_t; \
typedef typename traits::functor_type::result_type dst_t; \
cv::gpu::cudev::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \
}
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)
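// For reference, OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555) expands (roughly) to:
//
//     void bgr_to_bgr555(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
//     {
//         bgr_to_bgr555_traits::functor_type functor = bgr_to_bgr555_traits::create_functor();
//         typedef bgr_to_bgr555_traits::functor_type::argument_type src_t;
//         typedef bgr_to_bgr555_traits::functor_type::result_type dst_t;
//         cv::gpu::cudev::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream);
//     }
//
// so every line below defines one (or, for the _ALL/_8U32F variants, several) host-callable
// wrapper that launches the element-wise transform kernel with the matching conversion functor.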
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgra)
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
namespace filter
{
template void linearColumn<float, uchar>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
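// Note: each of these small translation units explicitly instantiates linearColumn for a
// single (intermediate, destination) type pair; the split across many .cu files presumably
// exists to keep nvcc's per-file compile time and memory usage down rather than to change
// any behaviour.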
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
namespace filter
{
template void linearColumn<float3, uchar3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
namespace filter
{
template void linearColumn<float, unsigned short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
namespace filter
{
template void linearColumn<float3, ushort3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
namespace filter
{
template void linearColumn<float4, ushort4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
namespace filter
{
template void linearColumn<float3, int3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
namespace filter
{
template void linearColumn<float4, int4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
namespace filter
{
template void linearColumn<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
namespace filter
{
template void linearColumn<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
namespace filter
{
template void linearColumn<float, int>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
namespace filter
{
template void linearColumn<float, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
namespace filter
{
template void linearColumn<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
namespace filter
{
template void linearColumn<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
namespace filter
{
template void linearColumn<float, short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "column_filter.h"
namespace filter
{
template void linearColumn<float4, short4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,372 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace column_filter
{
#define MAX_KERNEL_SIZE 32
__constant__ float c_kernel[MAX_KERNEL_SIZE];
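// Filter taps live in constant memory; the host side is expected to copy up to
// MAX_KERNEL_SIZE floats into c_kernel (e.g. with cudaMemcpyToSymbol) before launching.
// The loading code itself is not part of this excerpt.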
template <int KSIZE, typename T, typename D, typename B>
__global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
const int BLOCK_DIM_X = 16;
const int BLOCK_DIM_Y = 16;
const int PATCH_PER_BLOCK = 4;
const int HALO_SIZE = KSIZE <= 16 ? 1 : 2;
#else
const int BLOCK_DIM_X = 16;
const int BLOCK_DIM_Y = 8;
const int PATCH_PER_BLOCK = 2;
const int HALO_SIZE = 2;
#endif
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
__shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X];
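// Shared-memory tile: PATCH_PER_BLOCK * BLOCK_DIM_Y output rows plus HALO_SIZE * BLOCK_DIM_Y
// halo rows above and below them, with one image column handled per threadIdx.x.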
const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
if (x >= src.cols)
return;
const T* src_col = src.ptr() + x;
const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;
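// Stage the column into shared memory. Blocks that do not touch the top border read the
// upper halo straight from src; the first block row goes through the border object (brd)
// so out-of-range rows are synthesized according to the selected border mode.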
if (blockIdx.y > 0)
{
//Upper halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x));
}
else
{
//Upper halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));
}
if (blockIdx.y + 2 < gridDim.y)
{
//Main data
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x));
//Lower halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x));
}
else
{
//Main data
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));
//Lower halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));
}
__syncthreads();
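// Convolution proper: each thread produces PATCH_PER_BLOCK output pixels from the staged
// column, accumulating KSIZE taps from constant memory around the anchor position.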
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
{
const int y = yStart + j * BLOCK_DIM_Y;
if (y < src.rows)
{
sum_t sum = VecTraits<sum_t>::all(0);
#pragma unroll
for (int k = 0; k < KSIZE; ++k)
sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
dst(y, x) = saturate_cast<D>(sum);
}
}
}
template <int KSIZE, typename T, typename D, template<typename> class B>
void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
{
int BLOCK_DIM_X;
int BLOCK_DIM_Y;
int PATCH_PER_BLOCK;
if (cc >= 20)
{
BLOCK_DIM_X = 16;
BLOCK_DIM_Y = 16;
PATCH_PER_BLOCK = 4;
}
else
{
BLOCK_DIM_X = 16;
BLOCK_DIM_Y = 8;
PATCH_PER_BLOCK = 2;
}
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK));
B<T> brd(src.rows);
linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
namespace filter
{
template <typename T, typename D>
void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
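// Dispatch table indexed as callers[brd_type][ksize]: one row per border mode in the
// order reflect101, replicate, constant, reflect, wrap, and one entry per supported
// kernel size 1..32 (index 0 is a null placeholder, since a zero-tap kernel is invalid).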
static const caller_t callers[5][33] =
{
{
0,
column_filter::caller< 1, T, D, BrdColReflect101>,
column_filter::caller< 2, T, D, BrdColReflect101>,
column_filter::caller< 3, T, D, BrdColReflect101>,
column_filter::caller< 4, T, D, BrdColReflect101>,
column_filter::caller< 5, T, D, BrdColReflect101>,
column_filter::caller< 6, T, D, BrdColReflect101>,
column_filter::caller< 7, T, D, BrdColReflect101>,
column_filter::caller< 8, T, D, BrdColReflect101>,
column_filter::caller< 9, T, D, BrdColReflect101>,
column_filter::caller<10, T, D, BrdColReflect101>,
column_filter::caller<11, T, D, BrdColReflect101>,
column_filter::caller<12, T, D, BrdColReflect101>,
column_filter::caller<13, T, D, BrdColReflect101>,
column_filter::caller<14, T, D, BrdColReflect101>,
column_filter::caller<15, T, D, BrdColReflect101>,
column_filter::caller<16, T, D, BrdColReflect101>,
column_filter::caller<17, T, D, BrdColReflect101>,
column_filter::caller<18, T, D, BrdColReflect101>,
column_filter::caller<19, T, D, BrdColReflect101>,
column_filter::caller<20, T, D, BrdColReflect101>,
column_filter::caller<21, T, D, BrdColReflect101>,
column_filter::caller<22, T, D, BrdColReflect101>,
column_filter::caller<23, T, D, BrdColReflect101>,
column_filter::caller<24, T, D, BrdColReflect101>,
column_filter::caller<25, T, D, BrdColReflect101>,
column_filter::caller<26, T, D, BrdColReflect101>,
column_filter::caller<27, T, D, BrdColReflect101>,
column_filter::caller<28, T, D, BrdColReflect101>,
column_filter::caller<29, T, D, BrdColReflect101>,
column_filter::caller<30, T, D, BrdColReflect101>,
column_filter::caller<31, T, D, BrdColReflect101>,
column_filter::caller<32, T, D, BrdColReflect101>
},
{
0,
column_filter::caller< 1, T, D, BrdColReplicate>,
column_filter::caller< 2, T, D, BrdColReplicate>,
column_filter::caller< 3, T, D, BrdColReplicate>,
column_filter::caller< 4, T, D, BrdColReplicate>,
column_filter::caller< 5, T, D, BrdColReplicate>,
column_filter::caller< 6, T, D, BrdColReplicate>,
column_filter::caller< 7, T, D, BrdColReplicate>,
column_filter::caller< 8, T, D, BrdColReplicate>,
column_filter::caller< 9, T, D, BrdColReplicate>,
column_filter::caller<10, T, D, BrdColReplicate>,
column_filter::caller<11, T, D, BrdColReplicate>,
column_filter::caller<12, T, D, BrdColReplicate>,
column_filter::caller<13, T, D, BrdColReplicate>,
column_filter::caller<14, T, D, BrdColReplicate>,
column_filter::caller<15, T, D, BrdColReplicate>,
column_filter::caller<16, T, D, BrdColReplicate>,
column_filter::caller<17, T, D, BrdColReplicate>,
column_filter::caller<18, T, D, BrdColReplicate>,
column_filter::caller<19, T, D, BrdColReplicate>,
column_filter::caller<20, T, D, BrdColReplicate>,
column_filter::caller<21, T, D, BrdColReplicate>,
column_filter::caller<22, T, D, BrdColReplicate>,
column_filter::caller<23, T, D, BrdColReplicate>,
column_filter::caller<24, T, D, BrdColReplicate>,
column_filter::caller<25, T, D, BrdColReplicate>,
column_filter::caller<26, T, D, BrdColReplicate>,
column_filter::caller<27, T, D, BrdColReplicate>,
column_filter::caller<28, T, D, BrdColReplicate>,
column_filter::caller<29, T, D, BrdColReplicate>,
column_filter::caller<30, T, D, BrdColReplicate>,
column_filter::caller<31, T, D, BrdColReplicate>,
column_filter::caller<32, T, D, BrdColReplicate>
},
{
0,
column_filter::caller< 1, T, D, BrdColConstant>,
column_filter::caller< 2, T, D, BrdColConstant>,
column_filter::caller< 3, T, D, BrdColConstant>,
column_filter::caller< 4, T, D, BrdColConstant>,
column_filter::caller< 5, T, D, BrdColConstant>,
column_filter::caller< 6, T, D, BrdColConstant>,
column_filter::caller< 7, T, D, BrdColConstant>,
column_filter::caller< 8, T, D, BrdColConstant>,
column_filter::caller< 9, T, D, BrdColConstant>,
column_filter::caller<10, T, D, BrdColConstant>,
column_filter::caller<11, T, D, BrdColConstant>,
column_filter::caller<12, T, D, BrdColConstant>,
column_filter::caller<13, T, D, BrdColConstant>,
column_filter::caller<14, T, D, BrdColConstant>,
column_filter::caller<15, T, D, BrdColConstant>,
column_filter::caller<16, T, D, BrdColConstant>,
column_filter::caller<17, T, D, BrdColConstant>,
column_filter::caller<18, T, D, BrdColConstant>,
column_filter::caller<19, T, D, BrdColConstant>,
column_filter::caller<20, T, D, BrdColConstant>,
column_filter::caller<21, T, D, BrdColConstant>,
column_filter::caller<22, T, D, BrdColConstant>,
column_filter::caller<23, T, D, BrdColConstant>,
column_filter::caller<24, T, D, BrdColConstant>,
column_filter::caller<25, T, D, BrdColConstant>,
column_filter::caller<26, T, D, BrdColConstant>,
column_filter::caller<27, T, D, BrdColConstant>,
column_filter::caller<28, T, D, BrdColConstant>,
column_filter::caller<29, T, D, BrdColConstant>,
column_filter::caller<30, T, D, BrdColConstant>,
column_filter::caller<31, T, D, BrdColConstant>,
column_filter::caller<32, T, D, BrdColConstant>
},
{
0,
column_filter::caller< 1, T, D, BrdColReflect>,
column_filter::caller< 2, T, D, BrdColReflect>,
column_filter::caller< 3, T, D, BrdColReflect>,
column_filter::caller< 4, T, D, BrdColReflect>,
column_filter::caller< 5, T, D, BrdColReflect>,
column_filter::caller< 6, T, D, BrdColReflect>,
column_filter::caller< 7, T, D, BrdColReflect>,
column_filter::caller< 8, T, D, BrdColReflect>,
column_filter::caller< 9, T, D, BrdColReflect>,
column_filter::caller<10, T, D, BrdColReflect>,
column_filter::caller<11, T, D, BrdColReflect>,
column_filter::caller<12, T, D, BrdColReflect>,
column_filter::caller<13, T, D, BrdColReflect>,
column_filter::caller<14, T, D, BrdColReflect>,
column_filter::caller<15, T, D, BrdColReflect>,
column_filter::caller<16, T, D, BrdColReflect>,
column_filter::caller<17, T, D, BrdColReflect>,
column_filter::caller<18, T, D, BrdColReflect>,
column_filter::caller<19, T, D, BrdColReflect>,
column_filter::caller<20, T, D, BrdColReflect>,
column_filter::caller<21, T, D, BrdColReflect>,
column_filter::caller<22, T, D, BrdColReflect>,
column_filter::caller<23, T, D, BrdColReflect>,
column_filter::caller<24, T, D, BrdColReflect>,
column_filter::caller<25, T, D, BrdColReflect>,
column_filter::caller<26, T, D, BrdColReflect>,
column_filter::caller<27, T, D, BrdColReflect>,
column_filter::caller<28, T, D, BrdColReflect>,
column_filter::caller<29, T, D, BrdColReflect>,
column_filter::caller<30, T, D, BrdColReflect>,
column_filter::caller<31, T, D, BrdColReflect>,
column_filter::caller<32, T, D, BrdColReflect>
},
{
0,
column_filter::caller< 1, T, D, BrdColWrap>,
column_filter::caller< 2, T, D, BrdColWrap>,
column_filter::caller< 3, T, D, BrdColWrap>,
column_filter::caller< 4, T, D, BrdColWrap>,
column_filter::caller< 5, T, D, BrdColWrap>,
column_filter::caller< 6, T, D, BrdColWrap>,
column_filter::caller< 7, T, D, BrdColWrap>,
column_filter::caller< 8, T, D, BrdColWrap>,
column_filter::caller< 9, T, D, BrdColWrap>,
column_filter::caller<10, T, D, BrdColWrap>,
column_filter::caller<11, T, D, BrdColWrap>,
column_filter::caller<12, T, D, BrdColWrap>,
column_filter::caller<13, T, D, BrdColWrap>,
column_filter::caller<14, T, D, BrdColWrap>,
column_filter::caller<15, T, D, BrdColWrap>,
column_filter::caller<16, T, D, BrdColWrap>,
column_filter::caller<17, T, D, BrdColWrap>,
column_filter::caller<18, T, D, BrdColWrap>,
column_filter::caller<19, T, D, BrdColWrap>,
column_filter::caller<20, T, D, BrdColWrap>,
column_filter::caller<21, T, D, BrdColWrap>,
column_filter::caller<22, T, D, BrdColWrap>,
column_filter::caller<23, T, D, BrdColWrap>,
column_filter::caller<24, T, D, BrdColWrap>,
column_filter::caller<25, T, D, BrdColWrap>,
column_filter::caller<26, T, D, BrdColWrap>,
column_filter::caller<27, T, D, BrdColWrap>,
column_filter::caller<28, T, D, BrdColWrap>,
column_filter::caller<29, T, D, BrdColWrap>,
column_filter::caller<30, T, D, BrdColWrap>,
column_filter::caller<31, T, D, BrdColWrap>,
column_filter::caller<32, T, D, BrdColWrap>
}
};
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
}
}
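The table above is the dispatch pattern used throughout these kernels: a 5x33 array of function pointers indexed first by border mode (reflect101, replicate, constant, reflect, wrap, in the row order of the table) and then by kernel size 1..32, with slot 0 of each row left as a null sentinel. A minimal host-side sketch of the lookup; the helper name and generic signature are hypothetical, not part of the diff:

// Hypothetical helper illustrating the lookup into a callers[5][33] table like the
// one in filter::linearColumn above; CallerT stands for the per-type caller signature.
#include <cassert>

template <typename CallerT>
inline CallerT pickCaller(CallerT const (&callers)[5][33], int brd_type, int ksize)
{
    assert(brd_type >= 0 && brd_type < 5);  // reflect101, replicate, constant, reflect, wrap
    assert(ksize >= 1 && ksize <= 32);      // slot 0 of each row is a null sentinel
    return callers[brd_type][ksize];
}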

View File

@@ -0,0 +1,131 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, PtrStepSz<T> dst, int top, int left)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = src(y - top, x - left);
}
template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
{
static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, int top, int left,
const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));
BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);
copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode,
const T* borderValue, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type vec_type;
typedef void (*caller_t)(const PtrStepSz<vec_type>& src, const PtrStepSz<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
static const caller_t callers[5] =
{
CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call,
CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
CopyMakeBorderDispatcher<BrdWrap, vec_type>::call
};
callers[borderMode](PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
}
template void copyMakeBorder_gpu<uchar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<ushort, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<short, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<float, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
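For orientation, a host-side sketch of calling the copyMakeBorder_gpu instantiation defined above. The wrapper name and the raw-pointer plumbing are illustrative only; the borderMode index follows the callers[] order in the file (0 reflect101, 1 replicate, 2 constant, 3 reflect, 4 wrap):

// Minimal, assumption-laden sketch: pad an 8-bit single-channel device image with a
// constant border using the copyMakeBorder_gpu<uchar, 1> instantiation above.
void padConstant8u(uchar* d_src, size_t srcStep, int srcRows, int srcCols,
                   uchar* d_dst, size_t dstStep, int top, int left,
                   uchar value, cudaStream_t stream)
{
    // PtrStepSzb(rows, cols, data, step) wraps a pitched device allocation, as elsewhere in this diff.
    PtrStepSzb src(srcRows, srcCols, d_src, srcStep);
    PtrStepSzb dst(srcRows + 2 * top, srcCols + 2 * left, d_dst, dstStep); // symmetric padding assumed
    copyMakeBorder_gpu<uchar, 1>(src, dst, top, left, /*borderMode=*/2 /*constant*/, &value, stream);
}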

View File

@@ -0,0 +1,544 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/color.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct Bayer2BGR;
template <> struct Bayer2BGR<uchar>
{
uchar3 res0;
uchar3 res1;
uchar3 res2;
uchar3 res3;
__device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
{
uchar4 patch[3][3];
patch[0][1] = ((const uchar4*) src.ptr(s_y - 1))[s_x];
patch[0][0] = ((const uchar4*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
patch[1][1] = ((const uchar4*) src.ptr(s_y))[s_x];
patch[1][0] = ((const uchar4*) src.ptr(s_y))[::max(s_x - 1, 0)];
patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
patch[2][1] = ((const uchar4*) src.ptr(s_y + 1))[s_x];
patch[2][0] = ((const uchar4*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
if ((s_y & 1) ^ start_with_green)
{
const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1;
const int t1 = (patch[1][0].w + patch[1][1].y + 1) >> 1;
const int t2 = (patch[0][1].x + patch[0][1].z + patch[2][1].x + patch[2][1].z + 2) >> 2;
const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][1].z + patch[2][1].y + 2) >> 2;
const int t4 = (patch[0][1].z + patch[2][1].z + 1) >> 1;
const int t5 = (patch[1][1].y + patch[1][1].w + 1) >> 1;
const int t6 = (patch[0][1].z + patch[0][2].x + patch[2][1].z + patch[2][2].x + 2) >> 2;
const int t7 = (patch[0][1].w + patch[1][1].z + patch[1][2].x + patch[2][1].w + 2) >> 2;
if ((s_y & 1) ^ blue_last)
{
res0.x = t1;
res0.y = patch[1][1].x;
res0.z = t0;
res1.x = patch[1][1].y;
res1.y = t3;
res1.z = t2;
res2.x = t5;
res2.y = patch[1][1].z;
res2.z = t4;
res3.x = patch[1][1].w;
res3.y = t7;
res3.z = t6;
}
else
{
res0.x = t0;
res0.y = patch[1][1].x;
res0.z = t1;
res1.x = t2;
res1.y = t3;
res1.z = patch[1][1].y;
res2.x = t4;
res2.y = patch[1][1].z;
res2.z = t5;
res3.x = t6;
res3.y = t7;
res3.z = patch[1][1].w;
}
}
else
{
const int t0 = (patch[0][0].w + patch[0][1].y + patch[2][0].w + patch[2][1].y + 2) >> 2;
const int t1 = (patch[0][1].x + patch[1][0].w + patch[1][1].y + patch[2][1].x + 2) >> 2;
const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
const int t3 = (patch[1][1].x + patch[1][1].z + 1) >> 1;
const int t4 = (patch[0][1].y + patch[0][1].w + patch[2][1].y + patch[2][1].w + 2) >> 2;
const int t5 = (patch[0][1].z + patch[1][1].y + patch[1][1].w + patch[2][1].z + 2) >> 2;
const int t6 = (patch[0][1].w + patch[2][1].w + 1) >> 1;
const int t7 = (patch[1][1].z + patch[1][2].x + 1) >> 1;
if ((s_y & 1) ^ blue_last)
{
res0.x = patch[1][1].x;
res0.y = t1;
res0.z = t0;
res1.x = t3;
res1.y = patch[1][1].y;
res1.z = t2;
res2.x = patch[1][1].z;
res2.y = t5;
res2.z = t4;
res3.x = t7;
res3.y = patch[1][1].w;
res3.z = t6;
}
else
{
res0.x = t0;
res0.y = t1;
res0.z = patch[1][1].x;
res1.x = t2;
res1.y = patch[1][1].y;
res1.z = t3;
res2.x = t4;
res2.y = t5;
res2.z = patch[1][1].z;
res3.x = t6;
res3.y = patch[1][1].w;
res3.z = t7;
}
}
}
};
template <typename D> __device__ __forceinline__ D toDst(const uchar3& pix);
template <> __device__ __forceinline__ uchar toDst<uchar>(const uchar3& pix)
{
typename bgr_to_gray_traits<uchar>::functor_type f = bgr_to_gray_traits<uchar>::create_functor();
return f(pix);
}
template <> __device__ __forceinline__ uchar3 toDst<uchar3>(const uchar3& pix)
{
return pix;
}
template <> __device__ __forceinline__ uchar4 toDst<uchar4>(const uchar3& pix)
{
return make_uchar4(pix.x, pix.y, pix.z, 255);
}
template <typename D>
__global__ void Bayer2BGR_8u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
if (s_y >= src.rows || (s_x << 2) >= src.cols)
return;
s_y = ::min(::max(s_y, 1), src.rows - 2);
Bayer2BGR<uchar> bayer;
bayer.apply(src, s_x, s_y, blue_last, start_with_green);
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
dst(d_y, d_x) = toDst<D>(bayer.res0);
if (d_x + 1 < src.cols)
dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
if (d_x + 2 < src.cols)
dst(d_y, d_x + 2) = toDst<D>(bayer.res2);
if (d_x + 3 < src.cols)
dst(d_y, d_x + 3) = toDst<D>(bayer.res3);
}
template <> struct Bayer2BGR<ushort>
{
ushort3 res0;
ushort3 res1;
__device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
{
ushort2 patch[3][3];
patch[0][1] = ((const ushort2*) src.ptr(s_y - 1))[s_x];
patch[0][0] = ((const ushort2*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
patch[1][1] = ((const ushort2*) src.ptr(s_y))[s_x];
patch[1][0] = ((const ushort2*) src.ptr(s_y))[::max(s_x - 1, 0)];
patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
patch[2][1] = ((const ushort2*) src.ptr(s_y + 1))[s_x];
patch[2][0] = ((const ushort2*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
if ((s_y & 1) ^ start_with_green)
{
const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1;
const int t1 = (patch[1][0].y + patch[1][1].y + 1) >> 1;
const int t2 = (patch[0][1].x + patch[0][2].x + patch[2][1].x + patch[2][2].x + 2) >> 2;
const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][2].x + patch[2][1].y + 2) >> 2;
if ((s_y & 1) ^ blue_last)
{
res0.x = t1;
res0.y = patch[1][1].x;
res0.z = t0;
res1.x = patch[1][1].y;
res1.y = t3;
res1.z = t2;
}
else
{
res0.x = t0;
res0.y = patch[1][1].x;
res0.z = t1;
res1.x = t2;
res1.y = t3;
res1.z = patch[1][1].y;
}
}
else
{
const int t0 = (patch[0][0].y + patch[0][1].y + patch[2][0].y + patch[2][1].y + 2) >> 2;
const int t1 = (patch[0][1].x + patch[1][0].y + patch[1][1].y + patch[2][1].x + 2) >> 2;
const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
const int t3 = (patch[1][1].x + patch[1][2].x + 1) >> 1;
if ((s_y & 1) ^ blue_last)
{
res0.x = patch[1][1].x;
res0.y = t1;
res0.z = t0;
res1.x = t3;
res1.y = patch[1][1].y;
res1.z = t2;
}
else
{
res0.x = t0;
res0.y = t1;
res0.z = patch[1][1].x;
res1.x = t2;
res1.y = patch[1][1].y;
res1.z = t3;
}
}
}
};
template <typename D> __device__ __forceinline__ D toDst(const ushort3& pix);
template <> __device__ __forceinline__ ushort toDst<ushort>(const ushort3& pix)
{
typename bgr_to_gray_traits<ushort>::functor_type f = bgr_to_gray_traits<ushort>::create_functor();
return f(pix);
}
template <> __device__ __forceinline__ ushort3 toDst<ushort3>(const ushort3& pix)
{
return pix;
}
template <> __device__ __forceinline__ ushort4 toDst<ushort4>(const ushort3& pix)
{
return make_ushort4(pix.x, pix.y, pix.z, numeric_limits<ushort>::max());
}
template <typename D>
__global__ void Bayer2BGR_16u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
if (s_y >= src.rows || (s_x << 1) >= src.cols)
return;
s_y = ::min(::max(s_y, 1), src.rows - 2);
Bayer2BGR<ushort> bayer;
bayer.apply(src, s_x, s_y, blue_last, start_with_green);
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
dst(d_y, d_x) = toDst<D>(bayer.res0);
if (d_x + 1 < src.cols)
dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
}
template <int cn>
void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, 4 * block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_8u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int cn>
void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<ushort, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, 2 * block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_16u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void Bayer2BGR_8u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
//////////////////////////////////////////////////////////////
// Bayer Demosaicing (Malvar, He, and Cutler)
//
// by Morgan McGuire, Williams College
// http://graphics.cs.williams.edu/papers/BayerJGT09/#shaders
//
// ported to CUDA
texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);
template <typename DstType>
__global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed)
{
const float kAx = -1.0f / 8.0f, kAy = -1.5f / 8.0f, kAz = 0.5f / 8.0f /*kAw = -1.0f / 8.0f*/;
const float kBx = 2.0f / 8.0f, /*kBy = 0.0f / 8.0f,*/ /*kBz = 0.0f / 8.0f,*/ kBw = 4.0f / 8.0f ;
const float kCx = 4.0f / 8.0f, kCy = 6.0f / 8.0f, kCz = 5.0f / 8.0f /*kCw = 5.0f / 8.0f*/;
const float /*kDx = 0.0f / 8.0f,*/ kDy = 2.0f / 8.0f, kDz = -1.0f / 8.0f /*kDw = -1.0f / 8.0f*/;
const float kEx = -1.0f / 8.0f, kEy = -1.5f / 8.0f, /*kEz = -1.0f / 8.0f,*/ kEw = 0.5f / 8.0f ;
const float kFx = 2.0f / 8.0f, /*kFy = 0.0f / 8.0f,*/ kFz = 4.0f / 8.0f /*kFw = 0.0f / 8.0f*/;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x == 0 || x >= dst.cols - 1 || y == 0 || y >= dst.rows - 1)
return;
int2 center;
center.x = x + sourceOffset.x;
center.y = y + sourceOffset.y;
int4 xCoord;
xCoord.x = center.x - 2;
xCoord.y = center.x - 1;
xCoord.z = center.x + 1;
xCoord.w = center.x + 2;
int4 yCoord;
yCoord.x = center.y - 2;
yCoord.y = center.y - 1;
yCoord.z = center.y + 1;
yCoord.w = center.y + 2;
float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)
float4 Dvec;
Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)
float4 value;
value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0
// (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1
float4 PATTERN;
PATTERN.x = kCx * C;
PATTERN.y = kCy * C;
PATTERN.z = kCz * C;
PATTERN.w = PATTERN.z;
float D = Dvec.x + Dvec.y + Dvec.z + Dvec.w;
// There are five filter patterns (identity, cross, checker,
// theta, phi). Precompute the terms from all of them and then
// use swizzles to assign to color channels.
//
// Channel Matches
// x cross (e.g., EE G)
// y checker (e.g., EE B)
// z theta (e.g., EO R)
// w phi (e.g., EO B)
#define A value.x // A0 + A1
#define B value.y // B0 + B1
#define E value.z // E0 + E1
#define F value.w // F0 + F1
float3 temp;
// PATTERN.yzw += (kD.yz * D).xyy;
temp.x = kDy * D;
temp.y = kDz * D;
PATTERN.y += temp.x;
PATTERN.z += temp.y;
PATTERN.w += temp.y;
// PATTERN += (kA.xyz * A).xyzx;
temp.x = kAx * A;
temp.y = kAy * A;
temp.z = kAz * A;
PATTERN.x += temp.x;
PATTERN.y += temp.y;
PATTERN.z += temp.z;
PATTERN.w += temp.x;
// PATTERN += (kE.xyw * E).xyxz;
temp.x = kEx * E;
temp.y = kEy * E;
temp.z = kEw * E;
PATTERN.x += temp.x;
PATTERN.y += temp.y;
PATTERN.z += temp.x;
PATTERN.w += temp.z;
// PATTERN.xw += kB.xw * B;
PATTERN.x += kBx * B;
PATTERN.w += kBw * B;
// PATTERN.xz += kF.xz * F;
PATTERN.x += kFx * F;
PATTERN.z += kFz * F;
// Determine which of four types of pixels we are on.
int2 alternate;
alternate.x = (x + firstRed.x) % 2;
alternate.y = (y + firstRed.y) % 2;
// in BGR sequence;
uchar3 pixelColor =
(alternate.y == 0) ?
((alternate.x == 0) ?
make_uchar3(saturate_cast<uchar>(PATTERN.y), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(C)) :
make_uchar3(saturate_cast<uchar>(PATTERN.w), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.z))) :
((alternate.x == 0) ?
make_uchar3(saturate_cast<uchar>(PATTERN.z), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.w)) :
make_uchar3(saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(PATTERN.y)));
dst(y, x) = toDst<DstType>(pixelColor);
}
template <int cn>
void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream)
{
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
bindTexture(&sourceTex, src);
MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void MHCdemosaic<1>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
template void MHCdemosaic<3>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
template void MHCdemosaic<4>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
}}}
#endif /* CUDA_DISABLER */
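As a cross-check of the kA..kF constants above: the cross output PATTERN.x, the Malvar-He-Cutler estimate of green at a red or blue site, expands (reading the coefficients straight out of the kernel, with P_{dx,dy} denoting the raw Bayer sample at offset (dx, dy) from the centre C) to

\hat{G}_{0,0} = \frac{1}{8}\Big( 4C + 2\,(P_{0,-1} + P_{0,1} + P_{-1,0} + P_{1,0}) - (P_{0,-2} + P_{0,2} + P_{-2,0} + P_{2,0}) \Big)

that is, weight 4/8 on the centre (kCx), 2/8 on the four nearest green samples (kBx, kFx), and -1/8 on the four same-colour samples two steps away along the axes (kAx, kEx).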

View File

@@ -0,0 +1,223 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/limits.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace disp_bilateral_filter
{
__constant__ float* ctable_color;
__constant__ float* ctable_space;
__constant__ size_t ctable_space_step;
__constant__ int cndisp;
__constant__ int cradius;
__constant__ short cedge_disc;
__constant__ short cmax_disc;
void disp_load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc)
{
cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );
cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );
size_t table_space_step = table_space.step / sizeof(float);
cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );
cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );
cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );
}
template <int channels>
struct DistRgbMax
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
uchar x = ::abs(a[0] - b[0]);
uchar y = ::abs(a[1] - b[1]);
uchar z = ::abs(a[2] - b[2]);
return (::max(::max(x, y), z));
}
};
template <>
struct DistRgbMax<1>
{
static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
{
return ::abs(a[0] - b[0]);
}
};
template <int channels, typename T>
__global__ void disp_bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
T dp[5];
if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
{
dp[0] = *(disp + (y ) * disp_step + x + 0);
dp[1] = *(disp + (y-1) * disp_step + x + 0);
dp[2] = *(disp + (y ) * disp_step + x - 1);
dp[3] = *(disp + (y+1) * disp_step + x + 0);
dp[4] = *(disp + (y ) * disp_step + x + 1);
if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc)
{
const int ymin = ::max(0, y - cradius);
const int xmin = ::max(0, x - cradius);
const int ymax = ::min(h - 1, y + cradius);
const int xmax = ::min(w - 1, x + cradius);
float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
const uchar* ic = img + y * img_step + channels * x;
for(int yi = ymin; yi <= ymax; yi++)
{
const T* disp_y = disp + yi * disp_step;
for(int xi = xmin; xi <= xmax; xi++)
{
const uchar* in = img + yi * img_step + channels * xi;
uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];
const T disp_reg = disp_y[xi];
cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight;
cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;
cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;
cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;
cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;
}
}
float minimum = numeric_limits<float>::max();
int id = 0;
if (cost[0] < minimum)
{
minimum = cost[0];
id = 0;
}
if (cost[1] < minimum)
{
minimum = cost[1];
id = 1;
}
if (cost[2] < minimum)
{
minimum = cost[2];
id = 2;
}
if (cost[3] < minimum)
{
minimum = cost[3];
id = 3;
}
if (cost[4] < minimum)
{
minimum = cost[4];
id = 4;
}
*(disp + y * disp_step + x) = dp[id];
}
}
}
template <typename T>
void disp_bilateral_filter(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(disp.cols, threads.x << 1);
grid.y = divUp(disp.rows, threads.y);
switch (channels)
{
case 1:
for (int i = 0; i < iters; ++i)
{
disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
case 3:
for (int i = 0; i < iters; ++i)
{
disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
default:
CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void disp_bilateral_filter<uchar>(PtrStepSz<uchar> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
template void disp_bilateral_filter<short>(PtrStepSz<short> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
} // namespace disp_bilateral_filter
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
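A note on the two launches per iteration in disp_bilateral_filter above: the index expression x = 2*tid + ((y + t) & 1) makes pass t = 0 visit only pixels with x + y even and pass t = 1 only pixels with x + y odd, a red-black (checkerboard) schedule that keeps a pixel and its four neighbours out of the same pass. A tiny sketch of the schedule, with a hypothetical helper name:

// Which pass of the red-black schedule updates pixel (x, y)? Derived from the
// index arithmetic in the disp_bilateral_filter kernel above.
inline int passForPixel(int x, int y)
{
    return (x + y) & 1; // 0 -> first launch (t = 0), 1 -> second launch (t = 1)
}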

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,801 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "fgd_bgfg_common.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace bgfg
{
////////////////////////////////////////////////////////////////////////////
// calcDiffHistogram
const unsigned int UINT_BITS = 32U;
const int LOG_WARP_SIZE = 5;
const int WARP_SIZE = 1 << LOG_WARP_SIZE;
#if (__CUDA_ARCH__ < 120)
const unsigned int TAG_MASK = (1U << (UINT_BITS - LOG_WARP_SIZE)) - 1U;
#endif
const int MERGE_THREADBLOCK_SIZE = 256;
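// Note on addByte below (descriptive comment, not part of the original diff):
// on devices older than compute capability 1.2 shared-memory atomics are not
// available, so the histogram increment is emulated: each thread stamps the top
// LOG_WARP_SIZE bits of the bin with its lane-unique tag and retries until its own
// write survives, which serialises colliding lanes within a warp. On cc >= 1.2 the
// same update is a single atomicInc on shared memory and the tag bits are unused.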
__device__ __forceinline__ void addByte(unsigned int* s_WarpHist_, unsigned int data, unsigned int threadTag)
{
#if (__CUDA_ARCH__ < 120)
volatile unsigned int* s_WarpHist = s_WarpHist_;
unsigned int count;
do
{
count = s_WarpHist[data] & TAG_MASK;
count = threadTag | (count + 1);
s_WarpHist[data] = count;
} while (s_WarpHist[data] != count);
#else
atomicInc(s_WarpHist_ + data, (unsigned int)(-1));
#endif
}
template <typename PT, typename CT>
__global__ void calcPartialHistogram(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2)
{
#if (__CUDA_ARCH__ < 200)
const int HISTOGRAM_WARP_COUNT = 4;
#else
const int HISTOGRAM_WARP_COUNT = 6;
#endif
const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;
const int HISTOGRAM_THREADBLOCK_MEMORY = HISTOGRAM_WARP_COUNT * HISTOGRAM_BIN_COUNT;
//Per-warp subhistogram storage
__shared__ unsigned int s_Hist0[HISTOGRAM_THREADBLOCK_MEMORY];
__shared__ unsigned int s_Hist1[HISTOGRAM_THREADBLOCK_MEMORY];
__shared__ unsigned int s_Hist2[HISTOGRAM_THREADBLOCK_MEMORY];
//Clear shared memory storage for current threadblock before processing
#pragma unroll
for (int i = 0; i < (HISTOGRAM_THREADBLOCK_MEMORY / HISTOGRAM_THREADBLOCK_SIZE); ++i)
{
s_Hist0[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
s_Hist1[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
s_Hist2[threadIdx.x + i * HISTOGRAM_THREADBLOCK_SIZE] = 0;
}
__syncthreads();
const unsigned int warpId = threadIdx.x >> LOG_WARP_SIZE;
unsigned int* s_WarpHist0 = s_Hist0 + warpId * HISTOGRAM_BIN_COUNT;
unsigned int* s_WarpHist1 = s_Hist1 + warpId * HISTOGRAM_BIN_COUNT;
unsigned int* s_WarpHist2 = s_Hist2 + warpId * HISTOGRAM_BIN_COUNT;
const unsigned int tag = threadIdx.x << (UINT_BITS - LOG_WARP_SIZE);
const int dataCount = prevFrame.rows * prevFrame.cols;
for (unsigned int pos = blockIdx.x * HISTOGRAM_THREADBLOCK_SIZE + threadIdx.x; pos < dataCount; pos += HISTOGRAM_THREADBLOCK_SIZE * PARTIAL_HISTOGRAM_COUNT)
{
const unsigned int y = pos / prevFrame.cols;
const unsigned int x = pos % prevFrame.cols;
PT prevVal = prevFrame(y, x);
CT curVal = curFrame(y, x);
int3 diff = make_int3(
::abs(curVal.x - prevVal.x),
::abs(curVal.y - prevVal.y),
::abs(curVal.z - prevVal.z)
);
addByte(s_WarpHist0, diff.x, tag);
addByte(s_WarpHist1, diff.y, tag);
addByte(s_WarpHist2, diff.z, tag);
}
__syncthreads();
//Merge per-warp histograms into per-block and write to global memory
for (unsigned int bin = threadIdx.x; bin < HISTOGRAM_BIN_COUNT; bin += HISTOGRAM_THREADBLOCK_SIZE)
{
unsigned int sum0 = 0;
unsigned int sum1 = 0;
unsigned int sum2 = 0;
#pragma unroll
for (int i = 0; i < HISTOGRAM_WARP_COUNT; ++i)
{
#if (__CUDA_ARCH__ < 120)
sum0 += s_Hist0[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
sum1 += s_Hist1[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
sum2 += s_Hist2[bin + i * HISTOGRAM_BIN_COUNT] & TAG_MASK;
#else
sum0 += s_Hist0[bin + i * HISTOGRAM_BIN_COUNT];
sum1 += s_Hist1[bin + i * HISTOGRAM_BIN_COUNT];
sum2 += s_Hist2[bin + i * HISTOGRAM_BIN_COUNT];
#endif
}
partialBuf0[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum0;
partialBuf1[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum1;
partialBuf2[blockIdx.x * HISTOGRAM_BIN_COUNT + bin] = sum2;
}
}
__global__ void mergeHistogram(const unsigned int* partialBuf0, const unsigned int* partialBuf1, const unsigned int* partialBuf2, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2)
{
unsigned int sum0 = 0;
unsigned int sum1 = 0;
unsigned int sum2 = 0;
#pragma unroll
for (unsigned int i = threadIdx.x; i < PARTIAL_HISTOGRAM_COUNT; i += MERGE_THREADBLOCK_SIZE)
{
sum0 += partialBuf0[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
sum1 += partialBuf1[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
sum2 += partialBuf2[blockIdx.x + i * HISTOGRAM_BIN_COUNT];
}
__shared__ unsigned int data0[MERGE_THREADBLOCK_SIZE];
__shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE];
__shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE];
plus<unsigned int> op;
reduce<MERGE_THREADBLOCK_SIZE>(smem_tuple(data0, data1, data2), thrust::tie(sum0, sum1, sum2), threadIdx.x, thrust::make_tuple(op, op, op));
if(threadIdx.x == 0)
{
hist0[blockIdx.x] = sum0;
hist1[blockIdx.x] = sum1;
hist2[blockIdx.x] = sum2;
}
}
template <typename PT, typename CT>
void calcDiffHistogram_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame,
unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
bool cc20, cudaStream_t stream)
{
const int HISTOGRAM_WARP_COUNT = cc20 ? 6 : 4;
const int HISTOGRAM_THREADBLOCK_SIZE = HISTOGRAM_WARP_COUNT * WARP_SIZE;
calcPartialHistogram<PT, CT><<<PARTIAL_HISTOGRAM_COUNT, HISTOGRAM_THREADBLOCK_SIZE, 0, stream>>>(
(PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame, partialBuf0, partialBuf1, partialBuf2);
cudaSafeCall( cudaGetLastError() );
mergeHistogram<<<HISTOGRAM_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(partialBuf0, partialBuf1, partialBuf2, hist0, hist1, hist2);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
/////////////////////////////////////////////////////////////////////////
// calcDiffThreshMask
template <typename PT, typename CT>
__global__ void calcDiffThreshMask(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame, uchar3 bestThres, PtrStepb changeMask)
{
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
if (y >= prevFrame.rows || x >= prevFrame.cols)
return;
PT prevVal = prevFrame(y, x);
CT curVal = curFrame(y, x);
int3 diff = make_int3(
::abs(curVal.x - prevVal.x),
::abs(curVal.y - prevVal.y),
::abs(curVal.z - prevVal.z)
);
if (diff.x > bestThres.x || diff.y > bestThres.y || diff.z > bestThres.z)
changeMask(y, x) = 255;
}
template <typename PT, typename CT>
void calcDiffThreshMask_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
calcDiffThreshMask<PT, CT><<<grid, block, 0, stream>>>((PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame, bestThres, changeMask);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void calcDiffThreshMask_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
template void calcDiffThreshMask_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
template void calcDiffThreshMask_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
template void calcDiffThreshMask_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
/////////////////////////////////////////////////////////////////////////
// bgfgClassification
__constant__ BGPixelStat c_stat;
void setBGPixelStat(const BGPixelStat& stat)
{
cudaSafeCall( cudaMemcpyToSymbol(c_stat, &stat, sizeof(BGPixelStat)) );
}
template <typename T> struct Output;
template <> struct Output<uchar3>
{
static __device__ __forceinline__ uchar3 make(uchar v0, uchar v1, uchar v2)
{
return make_uchar3(v0, v1, v2);
}
};
template <> struct Output<uchar4>
{
static __device__ __forceinline__ uchar4 make(uchar v0, uchar v1, uchar v2)
{
return make_uchar4(v0, v1, v2, 255);
}
};
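// Descriptive note (not part of the original diff, a reading of the code below):
// Pv accumulates P(v), Pvb accumulates P(v | background) and Pb is the prior
// P(background), so the test 2 * Pvb * Pb <= Pv is Bayes' rule
// P(background | v) = Pvb * Pb / Pv <= 1/2; the pixel is marked foreground once
// the background posterior falls to one half or below.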
template <typename PT, typename CT, typename OT>
__global__ void bgfgClassification(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame,
const PtrStepb Ftd, const PtrStepb Fbd, PtrStepb foreground,
int deltaC, int deltaCC, float alpha2, int N1c, int N1cc)
{
const int i = blockIdx.y * blockDim.y + threadIdx.y;
const int j = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= prevFrame.rows || j >= prevFrame.cols)
return;
if (Fbd(i, j) || Ftd(i, j))
{
float Pb = 0.0f;
float Pv = 0.0f;
float Pvb = 0.0f;
int val = 0;
// Is it a motion pixel?
if (Ftd(i, j))
{
if (!c_stat.is_trained_dyn_model(i, j))
val = 1;
else
{
PT prevVal = prevFrame(i, j);
CT curVal = curFrame(i, j);
// Compare with stored CCt vectors:
for (int k = 0; k < N1cc && c_stat.PV_CC(i, j, k) > alpha2; ++k)
{
OT v1 = c_stat.V1_CC<OT>(i, j, k);
OT v2 = c_stat.V2_CC<OT>(i, j, k);
if (::abs(v1.x - prevVal.x) <= deltaCC &&
::abs(v1.y - prevVal.y) <= deltaCC &&
::abs(v1.z - prevVal.z) <= deltaCC &&
::abs(v2.x - curVal.x) <= deltaCC &&
::abs(v2.y - curVal.y) <= deltaCC &&
::abs(v2.z - curVal.z) <= deltaCC)
{
Pv += c_stat.PV_CC(i, j, k);
Pvb += c_stat.PVB_CC(i, j, k);
}
}
Pb = c_stat.Pbcc(i, j);
if (2 * Pvb * Pb <= Pv)
val = 1;
}
}
else if(c_stat.is_trained_st_model(i, j))
{
CT curVal = curFrame(i, j);
// Compare with stored Ct vectors:
for (int k = 0; k < N1c && c_stat.PV_C(i, j, k) > alpha2; ++k)
{
OT v = c_stat.V_C<OT>(i, j, k);
if (::abs(v.x - curVal.x) <= deltaC &&
::abs(v.y - curVal.y) <= deltaC &&
::abs(v.z - curVal.z) <= deltaC)
{
Pv += c_stat.PV_C(i, j, k);
Pvb += c_stat.PVB_C(i, j, k);
}
}
Pb = c_stat.Pbc(i, j);
if (2 * Pvb * Pb <= Pv)
val = 1;
}
// Update foreground:
foreground(i, j) = static_cast<uchar>(val);
} // end if( change detection...
}
template <typename PT, typename CT, typename OT>
void bgfgClassification_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground,
int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(bgfgClassification<PT, CT, OT>, cudaFuncCachePreferL1) );
bgfgClassification<PT, CT, OT><<<grid, block, 0, stream>>>((PtrStepSz<PT>)prevFrame, (PtrStepSz<CT>)curFrame,
Ftd, Fbd, foreground,
deltaC, deltaCC, alpha2, N1c, N1cc);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void bgfgClassification_gpu<uchar3, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template void bgfgClassification_gpu<uchar3, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template void bgfgClassification_gpu<uchar3, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template void bgfgClassification_gpu<uchar3, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template void bgfgClassification_gpu<uchar4, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template void bgfgClassification_gpu<uchar4, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template void bgfgClassification_gpu<uchar4, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template void bgfgClassification_gpu<uchar4, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
////////////////////////////////////////////////////////////////////////////
// updateBackgroundModel
template <typename PT, typename CT, typename OT, class PrevFramePtr2D, class CurFramePtr2D, class FtdPtr2D, class FbdPtr2D>
__global__ void updateBackgroundModel(int cols, int rows, const PrevFramePtr2D prevFrame, const CurFramePtr2D curFrame, const FtdPtr2D Ftd, const FbdPtr2D Fbd,
PtrStepb foreground, PtrStep<OT> background,
int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T)
{
const int i = blockIdx.y * blockDim.y + threadIdx.y;
const int j = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= rows || j >= cols)
return;
const float MIN_PV = 1e-10f;
const uchar is_trained_dyn_model = c_stat.is_trained_dyn_model(i, j);
if (Ftd(i, j) || !is_trained_dyn_model)
{
const float alpha = is_trained_dyn_model ? alpha2 : alpha3;
float Pbcc = c_stat.Pbcc(i, j);
//update Pb
Pbcc *= (1.0f - alpha);
if (!foreground(i, j))
{
Pbcc += alpha;
}
int min_dist = numeric_limits<int>::max();
int indx = -1;
PT prevVal = prevFrame(i, j);
CT curVal = curFrame(i, j);
// Find best Vi match:
for (int k = 0; k < N2cc; ++k)
{
float PV_CC = c_stat.PV_CC(i, j, k);
if (!PV_CC)
break;
if (PV_CC < MIN_PV)
{
c_stat.PV_CC(i, j, k) = 0;
c_stat.PVB_CC(i, j, k) = 0;
continue;
}
c_stat.PV_CC(i, j, k) = PV_CC * (1.0f - alpha);
c_stat.PVB_CC(i, j, k) = c_stat.PVB_CC(i, j, k) * (1.0f - alpha);
OT v1 = c_stat.V1_CC<OT>(i, j, k);
int3 val1 = make_int3(
::abs(v1.x - prevVal.x),
::abs(v1.y - prevVal.y),
::abs(v1.z - prevVal.z)
);
OT v2 = c_stat.V2_CC<OT>(i, j, k);
int3 val2 = make_int3(
::abs(v2.x - curVal.x),
::abs(v2.y - curVal.y),
::abs(v2.z - curVal.z)
);
int dist = val1.x + val1.y + val1.z + val2.x + val2.y + val2.z;
if (dist < min_dist &&
val1.x <= deltaCC && val1.y <= deltaCC && val1.z <= deltaCC &&
val2.x <= deltaCC && val2.y <= deltaCC && val2.z <= deltaCC)
{
min_dist = dist;
indx = k;
}
}
if (indx < 0)
{
// Replace the N2-th element in the table with the new feature:
indx = N2cc - 1;
c_stat.PV_CC(i, j, indx) = alpha;
c_stat.PVB_CC(i, j, indx) = alpha;
//update Vt
c_stat.V1_CC<OT>(i, j, indx) = Output<OT>::make(prevVal.x, prevVal.y, prevVal.z);
c_stat.V2_CC<OT>(i, j, indx) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
}
else
{
// Update:
c_stat.PV_CC(i, j, indx) += alpha;
if (!foreground(i, j))
{
c_stat.PVB_CC(i, j, indx) += alpha;
}
}
//re-sort CCt table by Pv
const float PV_CC_indx = c_stat.PV_CC(i, j, indx);
const float PVB_CC_indx = c_stat.PVB_CC(i, j, indx);
const OT V1_CC_indx = c_stat.V1_CC<OT>(i, j, indx);
const OT V2_CC_indx = c_stat.V2_CC<OT>(i, j, indx);
for (int k = 0; k < indx; ++k)
{
if (c_stat.PV_CC(i, j, k) <= PV_CC_indx)
{
//shift elements
float Pv_tmp1;
float Pv_tmp2 = PV_CC_indx;
float Pvb_tmp1;
float Pvb_tmp2 = PVB_CC_indx;
OT v1_tmp1;
OT v1_tmp2 = V1_CC_indx;
OT v2_tmp1;
OT v2_tmp2 = V2_CC_indx;
for (int l = k; l <= indx; ++l)
{
Pv_tmp1 = c_stat.PV_CC(i, j, l);
c_stat.PV_CC(i, j, l) = Pv_tmp2;
Pv_tmp2 = Pv_tmp1;
Pvb_tmp1 = c_stat.PVB_CC(i, j, l);
c_stat.PVB_CC(i, j, l) = Pvb_tmp2;
Pvb_tmp2 = Pvb_tmp1;
v1_tmp1 = c_stat.V1_CC<OT>(i, j, l);
c_stat.V1_CC<OT>(i, j, l) = v1_tmp2;
v1_tmp2 = v1_tmp1;
v2_tmp1 = c_stat.V2_CC<OT>(i, j, l);
c_stat.V2_CC<OT>(i, j, l) = v2_tmp2;
v2_tmp2 = v2_tmp1;
}
break;
}
}
float sum1 = 0.0f;
float sum2 = 0.0f;
//check "once-off" changes
for (int k = 0; k < N1cc; ++k)
{
const float PV_CC = c_stat.PV_CC(i, j, k);
if (!PV_CC)
break;
sum1 += PV_CC;
sum2 += c_stat.PVB_CC(i, j, k);
}
if (sum1 > T)
c_stat.is_trained_dyn_model(i, j) = 1;
float diff = sum1 - Pbcc * sum2;
// Update stat table:
if (diff > T)
{
//new BG features are discovered
for (int k = 0; k < N1cc; ++k)
{
const float PV_CC = c_stat.PV_CC(i, j, k);
if (!PV_CC)
break;
c_stat.PVB_CC(i, j, k) = (PV_CC - Pbcc * c_stat.PVB_CC(i, j, k)) / (1.0f - Pbcc);
}
}
c_stat.Pbcc(i, j) = Pbcc;
}
// Handle "stationary" pixel:
if (!Ftd(i, j))
{
const float alpha = c_stat.is_trained_st_model(i, j) ? alpha2 : alpha3;
float Pbc = c_stat.Pbc(i, j);
//update Pb
Pbc *= (1.0f - alpha);
if (!foreground(i, j))
{
Pbc += alpha;
}
int min_dist = numeric_limits<int>::max();
int indx = -1;
CT curVal = curFrame(i, j);
//find best Vi match
for (int k = 0; k < N2c; ++k)
{
float PV_C = c_stat.PV_C(i, j, k);
if (PV_C < MIN_PV)
{
c_stat.PV_C(i, j, k) = 0;
c_stat.PVB_C(i, j, k) = 0;
continue;
}
// Exponential decay of memory
c_stat.PV_C(i, j, k) = PV_C * (1.0f - alpha);
c_stat.PVB_C(i, j, k) = c_stat.PVB_C(i, j, k) * (1.0f - alpha);
OT v = c_stat.V_C<OT>(i, j, k);
int3 val = make_int3(
::abs(v.x - curVal.x),
::abs(v.y - curVal.y),
::abs(v.z - curVal.z)
);
int dist = val.x + val.y + val.z;
if (dist < min_dist && val.x <= deltaC && val.y <= deltaC && val.z <= deltaC)
{
min_dist = dist;
indx = k;
}
}
if (indx < 0)
{
// The last (N2c-th) element of the table is replaced by the new feature
indx = N2c - 1;
c_stat.PV_C(i, j, indx) = alpha;
c_stat.PVB_C(i, j, indx) = alpha;
// update Vt
c_stat.V_C<OT>(i, j, indx) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
}
else
{
//update
c_stat.PV_C(i, j, indx) += alpha;
if (!foreground(i, j))
{
c_stat.PVB_C(i, j, indx) += alpha;
}
}
//re-sort Ct table by Pv
const float PV_C_indx = c_stat.PV_C(i, j, indx);
const float PVB_C_indx = c_stat.PVB_C(i, j, indx);
OT V_C_indx = c_stat.V_C<OT>(i, j, indx);
for (int k = 0; k < indx; ++k)
{
if (c_stat.PV_C(i, j, k) <= PV_C_indx)
{
//shift elements
float Pv_tmp1;
float Pv_tmp2 = PV_C_indx;
float Pvb_tmp1;
float Pvb_tmp2 = PVB_C_indx;
OT v_tmp1;
OT v_tmp2 = V_C_indx;
for (int l = k; l <= indx; ++l)
{
Pv_tmp1 = c_stat.PV_C(i, j, l);
c_stat.PV_C(i, j, l) = Pv_tmp2;
Pv_tmp2 = Pv_tmp1;
Pvb_tmp1 = c_stat.PVB_C(i, j, l);
c_stat.PVB_C(i, j, l) = Pvb_tmp2;
Pvb_tmp2 = Pvb_tmp1;
v_tmp1 = c_stat.V_C<OT>(i, j, l);
c_stat.V_C<OT>(i, j, l) = v_tmp2;
v_tmp2 = v_tmp1;
}
break;
}
}
// Check "once-off" changes:
float sum1 = 0.0f;
float sum2 = 0.0f;
for (int k = 0; k < N1c; ++k)
{
const float PV_C = c_stat.PV_C(i, j, k);
if (!PV_C)
break;
sum1 += PV_C;
sum2 += c_stat.PVB_C(i, j, k);
}
if (sum1 > T)
c_stat.is_trained_st_model(i, j) = 1;
float diff = sum1 - Pbc * sum2;
// Update stat table:
if (diff > T)
{
//new BG features are discovered
for (int k = 0; k < N1c; ++k)
{
const float PV_C = c_stat.PV_C(i, j, k);
if (!PV_C)
break;
c_stat.PVB_C(i, j, k) = (PV_C - Pbc * c_stat.PVB_C(i, j, k)) / (1.0f - Pbc);
}
c_stat.Pbc(i, j) = 1.0f - Pbc;
}
else
{
c_stat.Pbc(i, j) = Pbc;
}
} // if !(change detection) at pixel (i,j)
// Update the reference BG image:
if (!foreground(i, j))
{
CT curVal = curFrame(i, j);
if (!Ftd(i, j) && !Fbd(i, j))
{
// Apply IIR filter:
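// background = (1 - alpha1) * background + alpha1 * curFrame, rounded per channel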
OT oldVal = background(i, j);
int3 newVal = make_int3(
__float2int_rn(oldVal.x * (1.0f - alpha1) + curVal.x * alpha1),
__float2int_rn(oldVal.y * (1.0f - alpha1) + curVal.y * alpha1),
__float2int_rn(oldVal.z * (1.0f - alpha1) + curVal.z * alpha1)
);
background(i, j) = Output<OT>::make(
static_cast<uchar>(newVal.x),
static_cast<uchar>(newVal.y),
static_cast<uchar>(newVal.z)
);
}
else
{
background(i, j) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
}
}
}
template <typename PT, typename CT, typename OT>
struct UpdateBackgroundModel
{
static void call(PtrStepSz<PT> prevFrame, PtrStepSz<CT> curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSz<OT> background,
int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(prevFrame.cols, block.x), divUp(prevFrame.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(updateBackgroundModel<PT, CT, OT, PtrStep<PT>, PtrStep<CT>, PtrStepb, PtrStepb>, cudaFuncCachePreferL1) );
updateBackgroundModel<PT, CT, OT, PtrStep<PT>, PtrStep<CT>, PtrStepb, PtrStepb><<<grid, block, 0, stream>>>(
prevFrame.cols, prevFrame.rows,
prevFrame, curFrame,
Ftd, Fbd, foreground, background,
deltaC, deltaCC, alpha1, alpha2, alpha3, N1c, N1cc, N2c, N2cc, T);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename PT, typename CT, typename OT>
void updateBackgroundModel_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background,
int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
cudaStream_t stream)
{
UpdateBackgroundModel<PT, CT, OT>::call(PtrStepSz<PT>(prevFrame), PtrStepSz<CT>(curFrame), Ftd, Fbd, foreground, PtrStepSz<OT>(background),
deltaC, deltaCC, alpha1, alpha2, alpha3, N1c, N1cc, N2c, N2cc, T, stream);
}
template void updateBackgroundModel_gpu<uchar3, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
template void updateBackgroundModel_gpu<uchar3, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
template void updateBackgroundModel_gpu<uchar3, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
template void updateBackgroundModel_gpu<uchar3, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
template void updateBackgroundModel_gpu<uchar4, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
template void updateBackgroundModel_gpu<uchar4, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
template void updateBackgroundModel_gpu<uchar4, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
template void updateBackgroundModel_gpu<uchar4, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background, int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,189 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __FGD_BGFG_COMMON_HPP__
#define __FGD_BGFG_COMMON_HPP__
#include "opencv2/core/cuda_devptrs.hpp"
namespace bgfg
{
struct BGPixelStat
{
public:
#ifdef __CUDACC__
__device__ float& Pbc(int i, int j);
__device__ float& Pbcc(int i, int j);
__device__ unsigned char& is_trained_st_model(int i, int j);
__device__ unsigned char& is_trained_dyn_model(int i, int j);
__device__ float& PV_C(int i, int j, int k);
__device__ float& PVB_C(int i, int j, int k);
template <typename T> __device__ T& V_C(int i, int j, int k);
__device__ float& PV_CC(int i, int j, int k);
__device__ float& PVB_CC(int i, int j, int k);
template <typename T> __device__ T& V1_CC(int i, int j, int k);
template <typename T> __device__ T& V2_CC(int i, int j, int k);
#endif
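    // All tables below are pitched device buffers. The per-pixel history tables
    // (ctable_* / cctable_*) stack their K slots vertically, so slot k of pixel
    // (i, j) lives at row (k * rows_ + i); see the accessors defined below.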
int rows_;
unsigned char* Pbc_data_;
size_t Pbc_step_;
unsigned char* Pbcc_data_;
size_t Pbcc_step_;
unsigned char* is_trained_st_model_data_;
size_t is_trained_st_model_step_;
unsigned char* is_trained_dyn_model_data_;
size_t is_trained_dyn_model_step_;
unsigned char* ctable_Pv_data_;
size_t ctable_Pv_step_;
unsigned char* ctable_Pvb_data_;
size_t ctable_Pvb_step_;
unsigned char* ctable_v_data_;
size_t ctable_v_step_;
unsigned char* cctable_Pv_data_;
size_t cctable_Pv_step_;
unsigned char* cctable_Pvb_data_;
size_t cctable_Pvb_step_;
unsigned char* cctable_v1_data_;
size_t cctable_v1_step_;
unsigned char* cctable_v2_data_;
size_t cctable_v2_step_;
};
#ifdef __CUDACC__
__device__ __forceinline__ float& BGPixelStat::Pbc(int i, int j)
{
return *((float*)(Pbc_data_ + i * Pbc_step_) + j);
}
__device__ __forceinline__ float& BGPixelStat::Pbcc(int i, int j)
{
return *((float*)(Pbcc_data_ + i * Pbcc_step_) + j);
}
__device__ __forceinline__ unsigned char& BGPixelStat::is_trained_st_model(int i, int j)
{
return *((unsigned char*)(is_trained_st_model_data_ + i * is_trained_st_model_step_) + j);
}
__device__ __forceinline__ unsigned char& BGPixelStat::is_trained_dyn_model(int i, int j)
{
return *((unsigned char*)(is_trained_dyn_model_data_ + i * is_trained_dyn_model_step_) + j);
}
__device__ __forceinline__ float& BGPixelStat::PV_C(int i, int j, int k)
{
return *((float*)(ctable_Pv_data_ + ((k * rows_) + i) * ctable_Pv_step_) + j);
}
__device__ __forceinline__ float& BGPixelStat::PVB_C(int i, int j, int k)
{
return *((float*)(ctable_Pvb_data_ + ((k * rows_) + i) * ctable_Pvb_step_) + j);
}
template <typename T> __device__ __forceinline__ T& BGPixelStat::V_C(int i, int j, int k)
{
return *((T*)(ctable_v_data_ + ((k * rows_) + i) * ctable_v_step_) + j);
}
__device__ __forceinline__ float& BGPixelStat::PV_CC(int i, int j, int k)
{
return *((float*)(cctable_Pv_data_ + ((k * rows_) + i) * cctable_Pv_step_) + j);
}
__device__ __forceinline__ float& BGPixelStat::PVB_CC(int i, int j, int k)
{
return *((float*)(cctable_Pvb_data_ + ((k * rows_) + i) * cctable_Pvb_step_) + j);
}
template <typename T> __device__ __forceinline__ T& BGPixelStat::V1_CC(int i, int j, int k)
{
return *((T*)(cctable_v1_data_ + ((k * rows_) + i) * cctable_v1_step_) + j);
}
template <typename T> __device__ __forceinline__ T& BGPixelStat::V2_CC(int i, int j, int k)
{
return *((T*)(cctable_v2_data_ + ((k * rows_) + i) * cctable_v2_step_) + j);
}
#endif
const int PARTIAL_HISTOGRAM_COUNT = 240;
const int HISTOGRAM_BIN_COUNT = 256;
template <typename PT, typename CT>
void calcDiffHistogram_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
bool cc20, cudaStream_t stream);
template <typename PT, typename CT>
void calcDiffThreshMask_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream);
void setBGPixelStat(const BGPixelStat& stat);
template <typename PT, typename CT, typename OT>
void bgfgClassification_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd, cv::gpu::PtrStepSzb foreground,
int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template <typename PT, typename CT, typename OT>
void updateBackgroundModel_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd, cv::gpu::PtrStepSzb foreground, cv::gpu::PtrStepSzb background,
int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
cudaStream_t stream);
}
#endif // __FGD_BGFG_COMMON_HPP__

View File

@@ -0,0 +1,143 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace gfft
{
texture<float, cudaTextureType2D, cudaReadModeElementType> eigTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__device__ int g_counter = 0;
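// findCorners keeps a pixel (i, j) as a corner if its response in eigTex exceeds
// the threshold and is the maximum of its 3x3 neighbourhood (non-maximum
// suppression). Accepted corners are appended to the output array through the
// device-global atomic counter g_counter, which the host reads back afterwards.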
template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols)
{
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j))
{
float val = tex2D(eigTex, j, i);
if (val > threshold)
{
float maxVal = val;
maxVal = ::fmax(tex2D(eigTex, j - 1, i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j , i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j - 1, i), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i), maxVal);
maxVal = ::fmax(tex2D(eigTex, j - 1, i + 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j , i + 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i + 1), maxVal);
if (val == maxVal)
{
const int ind = ::atomicAdd(&g_counter, 1);
if (ind < max_count)
corners[ind] = make_float2(j, i);
}
}
}
}
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
bindTexture(&eigTex, eig);
dim3 block(16, 16);
dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));
if (mask.data)
findCorners<<<grid, block>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
else
findCorners<<<grid, block>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
return std::min(count, max_count);
}
class EigGreater
{
public:
__device__ __forceinline__ bool operator()(float2 a, float2 b) const
{
return tex2D(eigTex, a.x, a.y) > tex2D(eigTex, b.x, b.y);
}
};
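// sortCorners_gpu orders the detected corners by decreasing response; the
// comparator above looks each candidate up in the bound eigenvalue texture.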
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count)
{
bindTexture(&eigTex, eig);
thrust::device_ptr<float2> ptr(corners);
thrust::sort(ptr, ptr + count, EigGreater());
}
} // namespace gfft
}}}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,153 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace hist
{
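// Each block accumulates a full 256-bin histogram in shared memory. Its 32x8 = 256
// threads each zero and later flush exactly one bin; during accumulation a block
// covers blockDim.y consecutive rows, reading four packed 8-bit pixels per 32-bit
// load, and the per-block result is merged into the global hist with atomicAdd.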
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
{
__shared__ int shist[256];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
unsigned int data = rowPtr[x];
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
unsigned int data = ((const uchar*)rowPtr)[x];
Emulation::smem::atomicAdd(&shist[data], 1);
}
}
}
__syncthreads();
const int histVal = shist[tid];
if (histVal > 0)
::atomicAdd(hist + tid, histVal);
}
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
/////////////////////////////////////////////////////////////////////////
namespace hist
{
__constant__ int c_lut[256];
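// c_lut receives the look-up table passed to equalizeHist below (copied into
// constant memory); EqualizeHist then maps each pixel as round(scale * lut[val]),
// where the caller supplies scale = 255 / (cols * rows).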
struct EqualizeHist : unary_function<uchar, uchar>
{
float scale;
__host__ EqualizeHist(float _scale) : scale(_scale) {}
__device__ __forceinline__ uchar operator ()(uchar val) const
{
const int lut = c_lut[val];
return __float2int_rn(scale * lut);
}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist>
{
enum { smart_shift = 4 };
};
}}}
namespace hist
{
void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
{
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );
const float scale = 255.0f / (src.cols * src.rows);
cudev::transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
}
}
#endif /* CUDA_DISABLER */

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,472 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
// Utility function to extract unsigned chars from an unsigned integer
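// e.g. 0x0A0B0C0D -> {x = 0x0D, y = 0x0C, z = 0x0B, w = 0x0A}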
__device__ uchar4 int_to_uchar4(unsigned int in)
{
uchar4 bytes;
bytes.x = (in & 0x000000ff) >> 0;
bytes.y = (in & 0x0000ff00) >> 8;
bytes.z = (in & 0x00ff0000) >> 16;
bytes.w = (in & 0xff000000) >> 24;
return bytes;
}
__global__ void shfl_integral_horizontal(const PtrStep<uint4> img, PtrStep<uint4> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ int sums[128];
const int id = threadIdx.x;
const int lane_id = id % warpSize;
const int warp_id = id / warpSize;
const uint4 data = img(blockIdx.x, id);
const uchar4 a = int_to_uchar4(data.x);
const uchar4 b = int_to_uchar4(data.y);
const uchar4 c = int_to_uchar4(data.z);
const uchar4 d = int_to_uchar4(data.w);
int result[16];
result[0] = a.x;
result[1] = result[0] + a.y;
result[2] = result[1] + a.z;
result[3] = result[2] + a.w;
result[4] = result[3] + b.x;
result[5] = result[4] + b.y;
result[6] = result[5] + b.z;
result[7] = result[6] + b.w;
result[8] = result[7] + c.x;
result[9] = result[8] + c.y;
result[10] = result[9] + c.z;
result[11] = result[10] + c.w;
result[12] = result[11] + d.x;
result[13] = result[12] + d.y;
result[14] = result[13] + d.z;
result[15] = result[14] + d.w;
int sum = result[15];
// the prefix sum of each thread's 16 values has been computed;
// now the final sums (result[15]) need to be shared
// with the other threads and added. To do this,
// the __shfl_up() instruction is used and a shuffle scan
// operation is performed to distribute the sums to the correct
// threads
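// As a small illustration with a 4-lane warp holding sums {1, 2, 3, 4}:
// after the offset-1 step the lanes hold {1, 3, 5, 7}, and after the
// offset-2 step {1, 3, 6, 10} -- the inclusive prefix sums.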
#pragma unroll
for (int i = 1; i < 32; i *= 2)
{
const int n = __shfl_up(sum, i, 32);
if (lane_id >= i)
{
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] += n;
sum += n;
}
}
// Now the final sum for the warp must be shared
// between warps. This is done by each warp
// having a thread store to shared memory, then
// having some other warp load the values and
// compute a prefix sum, again by using __shfl_up.
// The results are uniformly added back to the warps.
// The last thread in each warp, which holds the warp's sum,
// places that sum in shared memory.
if (threadIdx.x % warpSize == warpSize - 1)
sums[warp_id] = result[15];
__syncthreads();
if (warp_id == 0)
{
int warp_sum = sums[lane_id];
#pragma unroll
for (int i = 1; i <= 32; i *= 2)
{
const int n = __shfl_up(warp_sum, i, 32);
if (lane_id >= i)
warp_sum += n;
}
sums[lane_id] = warp_sum;
}
__syncthreads();
int blockSum = 0;
// fold in the running sum of the preceding warps
if (warp_id > 0)
{
blockSum = sums[warp_id - 1];
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] += blockSum;
}
// assemble result
// Each thread has 16 values to write, which are
// now integer data (to avoid overflow). Instead of
// each thread writing consecutive uint4s, the
// approach shown here experiments using
// the shuffle command to reformat the data
// inside the registers so that each thread holds
// consecutive data to be written so larger contiguous
// segments can be assembled for writing.
/*
For example data that needs to be written as
GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
but is stored in registers (r0..r3), in four threads (0..3) as:
threadId 0 1 2 3
r0 x0 y0 z0 w0
r1 x1 y1 z1 w1
r2 x2 y2 z2 w2
r3 x3 y3 z3 w3
after applying __shfl_xor operations to move data between registers r1..r3:
threadId 00 01 10 11
x0 y0 z0 w0
xor(01)->y1 x1 w1 z1
xor(10)->z2 w2 x2 y2
xor(11)->w3 z3 y3 x3
and now x0..x3, and z0..z3 can be written out in order by all threads.
In the current code, each register above is actually representing
four integers to be written as uint4's to GMEM.
*/
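// __shfl_xor(v, m, 32) makes lane i read v from lane i ^ m, i.e. a butterfly
// exchange between lanes whose indices differ only in the bits of m (mask 1
// pairs lanes 0<->1, 2<->3, ...), which is what produces the table above.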
result[4] = __shfl_xor(result[4] , 1, 32);
result[5] = __shfl_xor(result[5] , 1, 32);
result[6] = __shfl_xor(result[6] , 1, 32);
result[7] = __shfl_xor(result[7] , 1, 32);
result[8] = __shfl_xor(result[8] , 2, 32);
result[9] = __shfl_xor(result[9] , 2, 32);
result[10] = __shfl_xor(result[10], 2, 32);
result[11] = __shfl_xor(result[11], 2, 32);
result[12] = __shfl_xor(result[12], 3, 32);
result[13] = __shfl_xor(result[13], 3, 32);
result[14] = __shfl_xor(result[14], 3, 32);
result[15] = __shfl_xor(result[15], 3, 32);
uint4* integral_row = integral.ptr(blockIdx.x);
uint4 output;
///////
if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
///////
if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
// continuing from the above example,
// this use of __shfl_xor() places the y0..y3 and w0..w3 data
// in order.
#pragma unroll
for (int i = 0; i < 16; ++i)
result[i] = __shfl_xor(result[i], 1, 32);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 2)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
///////
if (threadIdx.x % 4 == 2)
output = make_uint4(result[0], result[1], result[2], result[3]);
if (threadIdx.x % 4 == 3)
output = make_uint4(result[4], result[5], result[6], result[7]);
if (threadIdx.x % 4 == 0)
output = make_uint4(result[8], result[9], result[10], result[11]);
if (threadIdx.x % 4 == 1)
output = make_uint4(result[12], result[13], result[14], result[15]);
integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
#endif
}
// This kernel computes columnwise prefix sums. When the data input is
// the row sums from above, this completes the integral image.
// The approach here is to have each block compute a local set of sums.
// First, the data covered by the block is loaded into shared memory,
// then instead of performing a sum in shared memory using __syncthreads
// between stages, the data is reformatted so that the necessary sums
// occur inside warps and the shuffle scan operation is used.
// The final set of sums from the block is then propagated, with the block
// computing "down" the image and adding the running sum to the local
// block sums.
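// Sketch of one loop iteration below: a 32 x blockDim.y tile is loaded into
// sums[][], each column's blockDim.y values are re-indexed into consecutive lanes
// of a warp and scanned with __shfl_up, the running carry from the rows processed
// so far (stepSum) is added, and the result is written back to the integral image.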
__global__ void shfl_integral_vertical(PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];
const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int lane_id = tidx % 8;
if (tidx >= integral.cols)
return;
sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();
unsigned int stepSum = 0;
for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
{
unsigned int* p = integral.ptr(y) + tidx;
unsigned int sum = *p;
sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();
// place into SMEM,
// re-index so that each column's partial sums fall inside a single warp,
// run a warp-level shuffle scan over them,
// then read the results back out in the original layout
const int j = threadIdx.x % 8;
const int k = threadIdx.x / 8 + threadIdx.y * 4;
int partial_sum = sums[k][j];
for (int i = 1; i <= 8; i *= 2)
{
int n = __shfl_up(partial_sum, i, 32);
if (lane_id >= i)
partial_sum += n;
}
sums[k][j] = partial_sum;
__syncthreads();
if (threadIdx.y > 0)
sum += sums[threadIdx.x][threadIdx.y - 1];
sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y - 1];
__syncthreads();
*p = sum;
}
#endif
}
void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
{
{
// each thread handles 16 values, use 1 block/row
// safe, because the step actually can't be less than 512 bytes
int block = integral.cols / 16;
// launch 1 block / row
const int grid = img.rows;
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
cudaSafeCall( cudaGetLastError() );
}
{
const dim3 block(32, 8);
const dim3 grid(divUp(integral.cols, block.x), 1);
shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void shfl_integral_vertical(PtrStepSz<unsigned int> buffer, PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];
const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
const int lane_id = tidx % 8;
if (tidx >= integral.cols)
return;
sums[threadIdx.x][threadIdx.y] = 0;
__syncthreads();
unsigned int stepSum = 0;
for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
{
unsigned int* p = buffer.ptr(y) + tidx;
unsigned int* dst = integral.ptr(y + 1) + tidx + 1;
unsigned int sum = *p;
sums[threadIdx.x][threadIdx.y] = sum;
__syncthreads();
// place into SMEM,
// re-index so that each column's partial sums fall inside a single warp,
// run a warp-level shuffle scan over them,
// then read the results back out in the original layout
const int j = threadIdx.x % 8;
const int k = threadIdx.x / 8 + threadIdx.y * 4;
int partial_sum = sums[k][j];
for (int i = 1; i <= 8; i *= 2)
{
int n = __shfl_up(partial_sum, i, 32);
if (lane_id >= i)
partial_sum += n;
}
sums[k][j] = partial_sum;
__syncthreads();
if (threadIdx.y > 0)
sum += sums[threadIdx.x][threadIdx.y - 1];
sum += stepSum;
stepSum += sums[threadIdx.x][blockDim.y - 1];
__syncthreads();
*dst = sum;
}
#endif
}
// used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz<uint4> buffer, PtrStepSz<unsigned int> integral,
int blockStep, cudaStream_t stream)
{
{
const int block = blockStep;
const int grid = img.rows;
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, buffer);
cudaSafeCall( cudaGetLastError() );
}
{
const dim3 block(32, 8);
const dim3 grid(divUp(integral.cols, block.x), 1);
shfl_integral_vertical<<<grid, block, 0, stream>>>((PtrStepSz<uint>)buffer, integral);
cudaSafeCall( cudaGetLastError() );
}
}
}
}}}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,73 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_internal_shared_HPP__
#define __OPENCV_internal_shared_HPP__
#include <cuda_runtime.h>
#include <npp.h>
#include "NPP_staging.hpp"
#include "opencv2/gpu/devmem2d.hpp"
#include "safe_call.hpp"
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu
{
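// RAII helper: switches the active NPP_staging CUDA stream on construction and
// restores the previous one when the handler goes out of scope.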
class NppStStreamHandler
{
public:
inline explicit NppStStreamHandler(cudaStream_t newStream = 0)
{
oldStream = nppStSetActiveCUDAstream(newStream);
}
inline ~NppStStreamHandler()
{
nppStSetActiveCUDAstream(oldStream);
}
private:
cudaStream_t oldStream;
};
}}
#endif /* __OPENCV_internal_shared_HPP__ */

View File

@@ -0,0 +1,916 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace match_template
{
__device__ __forceinline__ float sum(float v) { return v; }
__device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
__device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }
__device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }
__device__ __forceinline__ float first(float v) { return v; }
__device__ __forceinline__ float first(float2 v) { return v.x; }
__device__ __forceinline__ float first(float3 v) { return v.x; }
__device__ __forceinline__ float first(float4 v) { return v.x; }
__device__ __forceinline__ float mul(float a, float b) { return a * b; }
__device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
__device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
__device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
__device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }
__device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
__device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
__device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
__device__ __forceinline__ float sub(float a, float b) { return a - b; }
__device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
__device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
__device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
__device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }
__device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
__device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
__device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
//////////////////////////////////////////////////////////////////////
// Naive_CCORR
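// result(y, x) = sum over the template window of image(y + i, x + j) * templ(i, j),
// accumulated per channel and then summed across channels.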
template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
{
typedef typename TypeVec<T, cn>::vec_type Type;
typedef typename TypeVec<float, cn>::vec_type Typef;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
Typef res = VecTraits<Typef>::all(0);
for (int i = 0; i < h; ++i)
{
const Type* image_ptr = (const Type*)image.ptr(y + i);
const Type* templ_ptr = (const Type*)templ.ptr(i);
for (int j = 0; j < w; ++j)
res = res + mul(image_ptr[x + j], templ_ptr[j]);
}
result.ptr(y)[x] = sum(res);
}
}
template <typename T, int cn>
void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
};
callers[cn](image, templ, result, stream);
}
void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
};
callers[cn](image, templ, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Naive_SQDIFF
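// result(y, x) = sum over the template window of (image(y + i, x + j) - templ(i, j))^2,
// accumulated per channel and then summed across channels.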
template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
{
typedef typename TypeVec<T, cn>::vec_type Type;
typedef typename TypeVec<float, cn>::vec_type Typef;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
Typef res = VecTraits<Typef>::all(0);
Typef delta;
for (int i = 0; i < h; ++i)
{
const Type* image_ptr = (const Type*)image.ptr(y + i);
const Type* templ_ptr = (const Type*)templ.ptr(i);
for (int j = 0; j < w; ++j)
{
delta = sub(image_ptr[x + j], templ_ptr[j]);
res = res + delta * delta;
}
}
result.ptr(y)[x] = sum(res);
}
}
template <typename T, int cn>
void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
};
callers[cn](image, templ, result, stream);
}
void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
};
callers[cn](image, templ, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_SQDIFF
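// Uses the expansion sum (I - T)^2 = sum I^2 - 2 * sum I*T + sum T^2: the
// cross-correlation term is already stored in result, the window sum of I^2 is
// read from the squared-sum integral image, and sum T^2 is the precomputed
// templ_sqsum.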
template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sqsum_ = (float)(
(image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
(image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum;
}
}
template <int cn>
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn,
cudaStream_t stream)
{
typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
};
callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_SQDIFF_NORMED
// normAcc* are accurate normalization routines which make the GPU matchTemplate
// results consistent with the CPU implementation
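// The 1.125f band appears to tolerate rounding error when |num| slightly exceeds
// the denominator, clamping the score to +/-1 (or 1 for SQDIFF); beyond that band
// the denominator is treated as degenerate.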
__device__ float normAcc(float num, float denum)
{
if (::fabs(num) < denum)
return num / denum;
if (::fabs(num) < denum * 1.125f)
return num > 0 ? 1 : -1;
return 0;
}
__device__ float normAcc_SQDIFF(float num, float denum)
{
if (::fabs(num) < denum)
return num / denum;
if (::fabs(num) < denum * 1.125f)
return num > 0 ? 1 : -1;
return 1;
}
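// Prepared_SQDIFF_NORMED computes the same expansion as Prepared_SQDIFF and then
// normalizes by sqrt(window sum of I^2 * templ_sqsum) through normAcc_SQDIFF.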
template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sqsum_ = (float)(
(image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
(image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,
sqrtf(image_sqsum_ * templ_sqsum));
}
}
template <int cn>
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
};
callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_CCOFF
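// CCOEFF: result(y, x) = ccorr(y, x) - (window sum of the image) * mean(T), where
// mean(T) = templ_sum / (w * h) is passed in as templ_sum_scale and the window sum
// is taken from the integral image (one integral per channel for the multi-channel
// variants).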
__global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_ = (float)(
(image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
(image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;
}
}
void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
const PtrStep<unsigned int> image_sum_r,
const PtrStep<unsigned int> image_sum_g,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g;
}
}
void matchTemplatePrepared_CCOFF_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
unsigned int templ_sum_r, unsigned int templ_sum_g,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
image_sum_r, image_sum_g, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
int w, int h,
float templ_sum_scale_r,
float templ_sum_scale_g,
float templ_sum_scale_b,
const PtrStep<unsigned int> image_sum_r,
const PtrStep<unsigned int> image_sum_g,
const PtrStep<unsigned int> image_sum_b,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sum_b_ = (float)(
(image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
(image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g
- image_sum_b_ * templ_sum_scale_b;
}
}
void matchTemplatePrepared_CCOFF_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
w, h,
(float)templ_sum_r / (w * h),
(float)templ_sum_g / (w * h),
(float)templ_sum_b / (w * h),
image_sum_r, image_sum_g, image_sum_b, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
int w, int h,
float templ_sum_scale_r,
float templ_sum_scale_g,
float templ_sum_scale_b,
float templ_sum_scale_a,
const PtrStep<unsigned int> image_sum_r,
const PtrStep<unsigned int> image_sum_g,
const PtrStep<unsigned int> image_sum_b,
const PtrStep<unsigned int> image_sum_a,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sum_b_ = (float)(
(image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
(image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
float image_sum_a_ = (float)(
(image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
(image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g
- image_sum_b_ * templ_sum_scale_b
- image_sum_a_ * templ_sum_scale_a;
}
}
void matchTemplatePrepared_CCOFF_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
const PtrStepSz<unsigned int> image_sum_a,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
unsigned int templ_sum_a,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
w, h,
(float)templ_sum_r / (w * h),
(float)templ_sum_g / (w * h),
(float)templ_sum_b / (w * h),
(float)templ_sum_a / (w * h),
image_sum_r, image_sum_g, image_sum_b, image_sum_a,
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
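//
// All Prepared_CCOFF kernels above pull window sums out of integral images
// (summed-area tables) that are one row/column larger than the source, so the
// sum of a w x h window with top-left corner (x, y) costs four reads:
//
//   sum(x, y) = S(y + h, x + w) - S(y, x + w) - S(y + h, x) + S(y, x)
//
// which is exactly the pair of bracketed differences in each kernel. The kernel
// then subtracts mean(T) * sum(I_window) from the raw cross-correlation already
// stored in `result`, giving the CCOEFF response for that location.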
//////////////////////////////////////////////////////////////////////
// Prepared_CCOFF_NORMED
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
int w, int h, float weight,
float templ_sum_scale, float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum,
const PtrStep<unsigned long long> image_sqsum,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float ccorr = result.ptr(y)[x];
float image_sum_ = (float)(
(image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
(image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
float image_sqsum_ = (float)(
(image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
(image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,
sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));
}
}
void matchTemplatePrepared_CCOFF_NORMED_8U(
int w, int h, const PtrStepSz<unsigned int> image_sum,
const PtrStepSz<unsigned long long> image_sqsum,
unsigned int templ_sum, unsigned long long templ_sqsum,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
float weight = 1.f / (w * h);
float templ_sum_scale = templ_sum * weight;
float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;
matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
w, h, weight, templ_sum_scale, templ_sqsum_scale,
image_sum, image_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g,
float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sqsum_r_ = (float)(
(image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
(image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sqsum_g_ = (float)(
(image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
(image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g;
float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
+ image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));
result.ptr(y)[x] = normAcc(num, denum);
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
float weight = 1.f / (w * h);
float templ_sum_scale_r = templ_sum_r * weight;
float templ_sum_scale_g = templ_sum_g * weight;
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g,
templ_sqsum_scale,
image_sum_r, image_sqsum_r,
image_sum_g, image_sqsum_g,
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sqsum_r_ = (float)(
(image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
(image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sqsum_g_ = (float)(
(image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
(image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
float image_sum_b_ = (float)(
(image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
(image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
float image_sqsum_b_ = (float)(
(image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
(image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g
- image_sum_b_ * templ_sum_scale_b;
float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
+ image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
+ image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));
result.ptr(y)[x] = normAcc(num, denum);
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
float weight = 1.f / (w * h);
float templ_sum_scale_r = templ_sum_r * weight;
float templ_sum_scale_g = templ_sum_g * weight;
float templ_sum_scale_b = templ_sum_b * weight;
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g
+ templ_sqsum_b - weight * templ_sum_b * templ_sum_b;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
templ_sqsum_scale,
image_sum_r, image_sqsum_r,
image_sum_g, image_sqsum_g,
image_sum_b, image_sqsum_b,
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
float templ_sum_scale_a, float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sqsum_r_ = (float)(
(image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
(image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sqsum_g_ = (float)(
(image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
(image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
float image_sum_b_ = (float)(
(image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
(image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
float image_sqsum_b_ = (float)(
(image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
(image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
float image_sum_a_ = (float)(
(image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
(image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
float image_sqsum_a_ = (float)(
(image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) -
(image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x]));
float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g
- image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a;
float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
+ image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
+ image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_
+ image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));
result.ptr(y)[x] = normAcc(num, denum);
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
float weight = 1.f / (w * h);
float templ_sum_scale_r = templ_sum_r * weight;
float templ_sum_scale_g = templ_sum_g * weight;
float templ_sum_scale_b = templ_sum_b * weight;
float templ_sum_scale_a = templ_sum_a * weight;
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g
+ templ_sqsum_b - weight * templ_sum_b * templ_sum_b
+ templ_sqsum_a - weight * templ_sum_a * templ_sum_a;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
templ_sqsum_scale,
image_sum_r, image_sqsum_r,
image_sum_g, image_sqsum_g,
image_sum_b, image_sqsum_b,
image_sum_a, image_sqsum_a,
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
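//
// Per pixel, the Prepared_CCOFF_NORMED kernels above compute
//
//   num   = ccorr - sum_c image_sum_c * (templ_sum_c / (w*h))
//   denum = sqrt( templ_sqsum_scale * sum_c (image_sqsum_c - image_sum_c^2 / (w*h)) )
//
// where templ_sqsum_scale = sum_c (templ_sqsum_c - templ_sum_c^2 / (w*h)) is
// precomputed on the host, i.e. the usual CCOEFF_NORMED ratio of the demeaned
// template against the demeaned window. normAcc() (defined earlier in this file)
// clamps the ratio when the denominator becomes degenerate.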
//////////////////////////////////////////////////////////////////////
// normalize
template <int cn>
__global__ void normalizeKernel_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sqsum_ = (float)(
(image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
(image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));
}
}
void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 2:
normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 3:
normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 4:
normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
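//
// normalize_8U turns a raw cross-correlation into CCORR_NORMED by dividing it by
// sqrt(window_sqsum * templ_sqsum). The `* cn` column indexing suggests the
// square-sum integral was built over the image viewed as a single-channel matrix
// cn times wider, so one rectangle difference already covers all channels.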
//////////////////////////////////////////////////////////////////////
// extractFirstChannel
template <int cn>
__global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result)
{
typedef typename TypeVec<float, cn>::vec_type Typef;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
Typef val = ((const Typef*)image.ptr(y))[x];
result.ptr(y)[x] = first(val);
}
}
void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
break;
case 2:
extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
break;
case 3:
extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
break;
case 4:
extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
} //namespace match_template
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
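A minimal host-side sketch of how the kernels in this file are normally reached,
assuming the 2.4-era cv::gpu public API (GpuMat, gpu::matchTemplate, gpu::minMaxLoc);
the image paths are placeholders and the snippet is illustrative only, not part of
the module sources.

#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::Mat image = cv::imread("scene.png");   // 8UC3 search image (placeholder path)
    cv::Mat templ = cv::imread("patch.png");   // 8UC3 template (placeholder path)

    cv::gpu::GpuMat d_image(image), d_templ(templ), d_result;

    // Should end up in the Prepared_CCOFF_NORMED kernels defined above.
    cv::gpu::matchTemplate(d_image, d_templ, d_result, CV_TM_CCOEFF_NORMED);

    double maxVal;
    cv::Point maxLoc;
    cv::gpu::minMaxLoc(d_result, 0, &maxVal, 0, &maxLoc);

    return 0;
}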


@@ -0,0 +1,217 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace mathfunc
{
//////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar
struct Nothing
{
static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
{
}
};
struct Magnitude
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
}
};
struct MagnitudeSqr
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
}
};
struct Atan2
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
{
float angle = ::atan2f(y_data, x_data);
angle += (angle < 0) * 2.0f * CV_PI_F;
dst[y * dst_step + x] = scale * angle;
}
};
template <typename Mag, typename Angle>
__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float x_data = xptr[y * x_step + x];
float y_data = yptr[y * y_step + x];
Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
}
}
struct NonEmptyMag
{
static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
{
return mag[y * mag_step + x];
}
};
struct EmptyMag
{
static __device__ __forceinline__ float get(const float*, size_t, int, int)
{
return 1.0f;
}
};
template <typename Mag>
__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float mag_data = Mag::get(mag, mag_step, x, y);
float angle_data = angle[y * angle_step + x];
float sin_a, cos_a;
::sincosf(scale * angle_data, &sin_a, &cos_a);
xptr[y * x_step + x] = mag_data * cos_a;
yptr[y * y_step + x] = mag_data * sin_a;
}
}
template <typename Mag, typename Angle>
void cartToPolar_caller(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(x.cols, threads.x);
grid.y = divUp(x.rows, threads.y);
const float scale = angleInDegrees ? (180.0f / CV_PI_F) : 1.f;
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2][2][2] =
{
{
{
cartToPolar_caller<Magnitude, Atan2>,
cartToPolar_caller<Magnitude, Nothing>
},
{
cartToPolar_caller<MagnitudeSqr, Atan2>,
cartToPolar_caller<MagnitudeSqr, Nothing>,
}
},
{
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>
},
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>,
}
}
};
callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
}
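//
// The 2x2x2 table above is indexed as callers[mag.data == 0][magSqr][angle.data == 0]:
// a null magnitude output picks the Nothing functor for Mag, magSqr picks
// MagnitudeSqr over Magnitude, and a null angle output picks Nothing for Angle.
// When mag is null the magSqr flag no longer matters, which is why the second
// half of the table repeats the same two entries.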
template <typename Mag>
void polarToCart_caller(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(mag.cols, threads.x);
grid.y = divUp(mag.rows, threads.y);
const float scale = angleInDegrees ? (CV_PI_F / 180.0f) : 1.0f;
polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2] =
{
polarToCart_caller<NonEmptyMag>,
polarToCart_caller<EmptyMag>
};
callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
}
} // namespace mathfunc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

File diff suppressed because it is too large

569
modules/gpu/src/cuda/nlm.cu Normal file

@@ -0,0 +1,569 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
using namespace cv::gpu;
typedef unsigned char uchar;
typedef unsigned short ushort;
//////////////////////////////////////////////////////////////////////////////////
//// Non Local Means Denoising
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
__device__ __forceinline__ float norm2(const float& v) { return v*v; }
__device__ __forceinline__ float norm2(const float2& v) { return v.x*v.x + v.y*v.y; }
__device__ __forceinline__ float norm2(const float3& v) { return v.x*v.x + v.y*v.y + v.z*v.z; }
__device__ __forceinline__ float norm2(const float4& v) { return v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w; }
template<typename T, typename B>
__global__ void nlm_kernel(const PtrStep<T> src, PtrStepSz<T> dst, const B b, int search_radius, int block_radius, float noise_mult)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
const int i = blockDim.y * blockIdx.y + threadIdx.y;
const int j = blockDim.x * blockIdx.x + threadIdx.x;
if (j >= dst.cols || i >= dst.rows)
return;
int bsize = search_radius + block_radius;
int search_window = 2 * search_radius + 1;
float minus_search_window2_inv = -1.f/(search_window * search_window);
value_type sum1 = VecTraits<value_type>::all(0);
float sum2 = 0.f;
if (j - bsize >= 0 && j + bsize < dst.cols && i - bsize >= 0 && i + bsize < dst.rows)
{
for(float y = -search_radius; y <= search_radius; ++y)
for(float x = -search_radius; x <= search_radius; ++x)
{
float dist2 = 0;
for(float ty = -block_radius; ty <= block_radius; ++ty)
for(float tx = -block_radius; tx <= block_radius; ++tx)
{
value_type bv = saturate_cast<value_type>(src(i + y + ty, j + x + tx));
value_type av = saturate_cast<value_type>(src(i + ty, j + tx));
dist2 += norm2(av - bv);
}
float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv);
/*if (i == 255 && j == 255)
printf("%f %f\n", w, dist2 * minus_h2_inv + (x * x + y * y) * minus_search_window2_inv);*/
sum1 = sum1 + w * saturate_cast<value_type>(src(i + y, j + x));
sum2 += w;
}
}
else
{
for(float y = -search_radius; y <= search_radius; ++y)
for(float x = -search_radius; x <= search_radius; ++x)
{
float dist2 = 0;
for(float ty = -block_radius; ty <= block_radius; ++ty)
for(float tx = -block_radius; tx <= block_radius; ++tx)
{
value_type bv = saturate_cast<value_type>(b.at(i + y + ty, j + x + tx, src));
value_type av = saturate_cast<value_type>(b.at(i + ty, j + tx, src));
dist2 += norm2(av - bv);
}
float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv);
sum1 = sum1 + w * saturate_cast<value_type>(b.at(i + y, j + x, src));
sum2 += w;
}
}
dst(i, j) = saturate_cast<T>(sum1 / sum2);
}
template<typename T, template <typename> class B>
void nlm_caller(const PtrStepSzb src, PtrStepSzb dst, int search_radius, int block_radius, float h, cudaStream_t stream)
{
dim3 block (32, 8);
dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y));
B<T> b(src.rows, src.cols);
int block_window = 2 * block_radius + 1;
float minus_h2_inv = -1.f/(h * h * VecTraits<T>::cn);
float noise_mult = minus_h2_inv/(block_window * block_window);
cudaSafeCall( cudaFuncSetCacheConfig (nlm_kernel<T, B<T> >, cudaFuncCachePreferL1) );
nlm_kernel<<<grid, block, 0, stream>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, search_radius, block_radius, noise_mult);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template<typename T>
void nlm_bruteforce_gpu(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream)
{
typedef void (*func_t)(const PtrStepSzb src, PtrStepSzb dst, int search_radius, int block_radius, float h, cudaStream_t stream);
static func_t funcs[] =
{
nlm_caller<T, BrdReflect101>,
nlm_caller<T, BrdReplicate>,
nlm_caller<T, BrdConstant>,
nlm_caller<T, BrdReflect>,
nlm_caller<T, BrdWrap>,
};
funcs[borderMode](src, dst, search_radius, block_radius, h, stream);
}
template void nlm_bruteforce_gpu<uchar>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
template void nlm_bruteforce_gpu<uchar2>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
template void nlm_bruteforce_gpu<uchar3>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
}
}}}
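//
// For every search offset (x, y) the brute-force kernel accumulates the patch
// distance dist2 over a (2*block_radius+1)^2 block and weights the contribution
// of pixel (i+y, j+x) with
//
//   w = exp( -dist2 / (h^2 * cn * block_window^2) - (x^2 + y^2) / search_window^2 )
//
// noise_mult and minus_search_window2_inv are exactly those two factors,
// precomputed on the host in nlm_caller.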
//////////////////////////////////////////////////////////////////////////////////
//// Non Local Means Denoising (fast approximate version)
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <int cn> struct Unroll;
template <> struct Unroll<1>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&> tie(float& val1, float& val2)
{
return thrust::tie(val1, val2);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op);
}
};
template <> struct Unroll<2>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&> tie(float& val1, float2& val2)
{
return thrust::tie(val1, val2.x, val2.y);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op);
}
};
template <> struct Unroll<3>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&> tie(float& val1, float3& val2)
{
return thrust::tie(val1, val2.x, val2.y, val2.z);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op);
}
};
template <> struct Unroll<4>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&, float&> tie(float& val1, float4& val2)
{
return thrust::tie(val1, val2.x, val2.y, val2.z, val2.w);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op, op);
}
};
__device__ __forceinline__ int calcDist(const uchar& a, const uchar& b) { return (a-b)*(a-b); }
__device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); }
__device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); }
template <class T> struct FastNonLocalMenas
{
enum
{
CTA_SIZE = 128,
TILE_COLS = 128,
TILE_ROWS = 32,
STRIDE = CTA_SIZE
};
struct plus
{
__device__ __forceinline__ float operator()(float v1, float v2) const { return v1 + v2; }
};
int search_radius;
int block_radius;
int search_window;
int block_window;
float minus_h2_inv;
FastNonLocalMenas(int search_window_, int block_window_, float h) : search_radius(search_window_/2), block_radius(block_window_/2),
search_window(search_window_), block_window(block_window_), minus_h2_inv(-1.f/(h * h * VecTraits<T>::cn)) {}
PtrStep<T> src;
mutable PtrStepi buffer;
__device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
dist_sums[index] = 0;
for(int tx = 0; tx < block_window; ++tx)
col_sums(tx, index) = 0;
int y = index / search_window;
int x = index - y * search_window;
int ay = i;
int ax = j;
int by = i + y - search_radius;
int bx = j + x - search_radius;
#if 1
for (int tx = -block_radius; tx <= block_radius; ++tx)
{
int col_sum = 0;
for (int ty = -block_radius; ty <= block_radius; ++ty)
{
int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx));
dist_sums[index] += dist;
col_sum += dist;
}
col_sums(tx + block_radius, index) = col_sum;
}
#else
for (int ty = -block_radius; ty <= block_radius; ++ty)
for (int tx = -block_radius; tx <= block_radius; ++tx)
{
int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx));
dist_sums[index] += dist;
col_sums(tx + block_radius, index) += dist;
}
#endif
up_col_sums(j, index) = col_sums(block_window - 1, index);
}
}
__device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
int ay = i;
int ax = j + block_radius;
int by = i + y - search_radius;
int bx = j + x - search_radius + block_radius;
int col_sum = 0;
for (int ty = -block_radius; ty <= block_radius; ++ty)
col_sum += calcDist(src(ay + ty, ax), src(by + ty, bx));
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
up_col_sums(j, index) = col_sum;
}
}
__device__ __forceinline__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
int ay = i;
int ax = j + block_radius;
T a_up = src(ay - block_radius - 1, ax);
T a_down = src(ay + block_radius, ax);
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
int by = i + y - search_radius;
int bx = j + x - search_radius + block_radius;
T b_up = src(by - block_radius - 1, bx);
T b_down = src(by + block_radius, bx);
int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up);
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
up_col_sums(j, index) = col_sum;
}
}
__device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums, T& dst) const
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_type;
float weights_sum = 0;
sum_type sum = VecTraits<sum_type>::all(0);
float bw2_inv = 1.f/(block_window * block_window);
int sx = j - search_radius;
int sy = i - search_radius;
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
float avg_dist = dist_sums[index] * bw2_inv;
float weight = __expf(avg_dist * minus_h2_inv);
weights_sum += weight;
sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x));
}
__shared__ float cta_buffer[CTA_SIZE * (VecTraits<T>::cn + 1)];
reduce<CTA_SIZE>(Unroll<VecTraits<T>::cn>::template smem_tuple<CTA_SIZE>(cta_buffer),
Unroll<VecTraits<T>::cn>::tie(weights_sum, sum),
threadIdx.x,
Unroll<VecTraits<T>::cn>::op());
if (threadIdx.x == 0)
dst = saturate_cast<T>(sum / weights_sum);
}
__device__ __forceinline__ void operator()(PtrStepSz<T>& dst) const
{
int tbx = blockIdx.x * TILE_COLS;
int tby = blockIdx.y * TILE_ROWS;
int tex = ::min(tbx + TILE_COLS, dst.cols);
int tey = ::min(tby + TILE_ROWS, dst.rows);
PtrStepi col_sums;
col_sums.data = buffer.ptr(dst.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window;
col_sums.step = buffer.step;
PtrStepi up_col_sums;
up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window;
up_col_sums.step = buffer.step;
extern __shared__ int dist_sums[]; //search_window * search_window
int first = 0;
for (int i = tby; i < tey; ++i)
for (int j = tbx; j < tex; ++j)
{
__syncthreads();
if (j == tbx)
{
initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums);
first = 0;
}
else
{
if (i == tby)
shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums);
else
shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums);
first = (first + 1) % block_window;
}
__syncthreads();
convolve_window(i, j, dist_sums, col_sums, up_col_sums, dst(i, j));
}
}
};
template<typename T>
__global__ void fast_nlm_kernel(const FastNonLocalMenas<T> fnlm, PtrStepSz<T> dst) { fnlm(dst); }
void nln_fast_get_buffer_size(const PtrStepSzb& src, int search_window, int block_window, int& buffer_cols, int& buffer_rows)
{
typedef FastNonLocalMenas<uchar> FNLM;
dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS));
buffer_cols = search_window * search_window * grid.y;
buffer_rows = src.cols + block_window * grid.x;
}
template<typename T>
void nlm_fast_gpu(const PtrStepSzb& src, PtrStepSzb dst, PtrStepi buffer,
int search_window, int block_window, float h, cudaStream_t stream)
{
typedef FastNonLocalMenas<T> FNLM;
FNLM fnlm(search_window, block_window, h);
fnlm.src = (PtrStepSz<T>)src;
fnlm.buffer = buffer;
dim3 block(FNLM::CTA_SIZE, 1);
dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS));
int smem = search_window * search_window * sizeof(int);
fast_nlm_kernel<<<grid, block, smem, stream>>>(fnlm, (PtrStepSz<T>)dst);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void nlm_fast_gpu<uchar>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
template void nlm_fast_gpu<uchar2>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
template void nlm_fast_gpu<uchar3>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
__global__ void fnlm_split_kernel(const PtrStepSz<uchar3> lab, PtrStepb l, PtrStep<uchar2> ab)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x < lab.cols && y < lab.rows)
{
uchar3 p = lab(y, x);
ab(y,x) = make_uchar2(p.y, p.z);
l(y,x) = p.x;
}
}
void fnlm_split_channels(const PtrStepSz<uchar3>& lab, PtrStepb l, PtrStep<uchar2> ab, cudaStream_t stream)
{
dim3 b(32, 8);
dim3 g(divUp(lab.cols, b.x), divUp(lab.rows, b.y));
fnlm_split_kernel<<<g, b, 0, stream>>>(lab, l, ab);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void fnlm_merge_kernel(const PtrStepb l, const PtrStep<uchar2> ab, PtrStepSz<uchar3> lab)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x < lab.cols && y < lab.rows)
{
uchar2 p = ab(y, x);
lab(y, x) = make_uchar3(l(y, x), p.x, p.y);
}
}
void fnlm_merge_channels(const PtrStepb& l, const PtrStep<uchar2>& ab, PtrStepSz<uchar3> lab, cudaStream_t stream)
{
dim3 b(32, 8);
dim3 g(divUp(lab.cols, b.x), divUp(lab.rows, b.y));
fnlm_merge_kernel<<<g, b, 0, stream>>>(l, ab, lab);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif /* CUDA_DISABLER */
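A minimal sketch of how the fast path above might be driven from host code for an
8UC3 image already on the device, assuming the GpuMat-to-PtrStep conversions used
throughout the gpu module; the declarations would normally come from an internal
header and the names fastNlmDemo/d_src/d_dst are placeholders, not part of the API.

#include <opencv2/gpu/gpu.hpp>

using namespace cv::gpu;

void fastNlmDemo(const GpuMat& d_src, GpuMat& d_dst, float h,
                 int search_window = 21, int block_window = 7)
{
    d_dst.create(d_src.size(), d_src.type());

    // Buffer geometry mirrors nln_fast_get_buffer_size above.
    int bcols, brows;
    cudev::imgproc::nln_fast_get_buffer_size(d_src, search_window, block_window, bcols, brows);

    GpuMat buffer(brows, bcols, CV_32S);   // running column sums, one tile per block

    cudev::imgproc::nlm_fast_gpu<uchar3>(d_src, d_dst, buffer,
                                         search_window, block_window, h, 0);
}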


@@ -0,0 +1,414 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace optflowbm
{
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_prev(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_curr(false, cudaFilterModePoint, cudaAddressModeClamp);
__device__ int cmpBlocks(int X1, int Y1, int X2, int Y2, int2 blockSize)
{
int s = 0;
for (int y = 0; y < blockSize.y; ++y)
{
for (int x = 0; x < blockSize.x; ++x)
s += ::abs(tex2D(tex_prev, X1 + x, Y1 + y) - tex2D(tex_curr, X2 + x, Y2 + y));
}
return s;
}
__global__ void calcOptFlowBM(PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious,
const int maxX, const int maxY, const int acceptLevel, const int escapeLevel,
const short2* ss, const int ssCount)
{
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i >= velx.rows || j >= velx.cols)
return;
const int X1 = j * shiftSize.x;
const int Y1 = i * shiftSize.y;
const int offX = usePrevious ? __float2int_rn(velx(i, j)) : 0;
const int offY = usePrevious ? __float2int_rn(vely(i, j)) : 0;
int X2 = X1 + offX;
int Y2 = Y1 + offY;
int dist = numeric_limits<int>::max();
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
dist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
int countMin = 1;
int sumx = offX;
int sumy = offY;
if (dist > acceptLevel)
{
// do brute-force search
for (int k = 0; k < ssCount; ++k)
{
const short2 ssVal = ss[k];
const int dx = offX + ssVal.x;
const int dy = offY + ssVal.y;
X2 = X1 + dx;
Y2 = Y1 + dy;
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
{
const int tmpDist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
if (tmpDist < acceptLevel)
{
sumx = dx;
sumy = dy;
countMin = 1;
break;
}
if (tmpDist < dist)
{
dist = tmpDist;
sumx = dx;
sumy = dy;
countMin = 1;
}
else if (tmpDist == dist)
{
sumx += dx;
sumy += dy;
countMin++;
}
}
}
if (dist > escapeLevel)
{
sumx = offX;
sumy = offY;
countMin = 1;
}
}
velx(i, j) = static_cast<float>(sumx) / countMin;
vely(i, j) = static_cast<float>(sumy) / countMin;
}
void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious,
int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream)
{
bindTexture(&tex_prev, prev);
bindTexture(&tex_curr, curr);
const dim3 block(32, 8);
const dim3 grid(divUp(velx.cols, block.x), divUp(vely.rows, block.y));
calcOptFlowBM<<<grid, block, 0, stream>>>(velx, vely, blockSize, shiftSize, usePrevious,
maxX, maxY, acceptLevel, escapeLevel, ss, ssCount);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
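//
// Search strategy of calcOptFlowBM: the displacement carried over from the
// previous solution (or zero) is scored first; if its SAD exceeds acceptLevel,
// every offset in the ss[] table is tried relative to it, stopping early once a
// candidate drops below acceptLevel, otherwise keeping the best SAD and averaging
// tied candidates via sumx/sumy and countMin. If even the best distance stays
// above escapeLevel, the flow falls back to the initial displacement.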
}
/////////////////////////////////////////////////////////
// Fast approximate version
namespace optflowbm_fast
{
enum
{
CTA_SIZE = 128,
TILE_COLS = 128,
TILE_ROWS = 32,
STRIDE = CTA_SIZE
};
template <typename T> __device__ __forceinline__ int calcDist(T a, T b)
{
return ::abs(a - b);
}
template <class T> struct FastOptFlowBM
{
int search_radius;
int block_radius;
int search_window;
int block_window;
PtrStepSz<T> I0;
PtrStep<T> I1;
mutable PtrStepi buffer;
FastOptFlowBM(int search_window_, int block_window_,
PtrStepSz<T> I0_, PtrStepSz<T> I1_,
PtrStepi buffer_) :
search_radius(search_window_ / 2), block_radius(block_window_ / 2),
search_window(search_window_), block_window(block_window_),
I0(I0_), I1(I1_),
buffer(buffer_)
{
}
__device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
dist_sums[index] = 0;
for (int tx = 0; tx < block_window; ++tx)
col_sums(tx, index) = 0;
int y = index / search_window;
int x = index - y * search_window;
int ay = i;
int ax = j;
int by = i + y - search_radius;
int bx = j + x - search_radius;
for (int tx = -block_radius; tx <= block_radius; ++tx)
{
int col_sum = 0;
for (int ty = -block_radius; ty <= block_radius; ++ty)
{
int dist = calcDist(I0(ay + ty, ax + tx), I1(by + ty, bx + tx));
dist_sums[index] += dist;
col_sum += dist;
}
col_sums(tx + block_radius, index) = col_sum;
}
up_col_sums(j, index) = col_sums(block_window - 1, index);
}
}
__device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
int ay = i;
int ax = j + block_radius;
int by = i + y - search_radius;
int bx = j + x - search_radius + block_radius;
int col_sum = 0;
for (int ty = -block_radius; ty <= block_radius; ++ty)
col_sum += calcDist(I0(ay + ty, ax), I1(by + ty, bx));
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
up_col_sums(j, index) = col_sum;
}
}
__device__ __forceinline__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
int ay = i;
int ax = j + block_radius;
T a_up = I0(ay - block_radius - 1, ax);
T a_down = I0(ay + block_radius, ax);
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
int by = i + y - search_radius;
int bx = j + x - search_radius + block_radius;
T b_up = I1(by - block_radius - 1, bx);
T b_down = I1(by + block_radius, bx);
int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up);
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
up_col_sums(j, index) = col_sum;
}
}
__device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, float& velx, float& vely) const
{
int bestDist = numeric_limits<int>::max();
int bestInd = -1;
for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int curDist = dist_sums[index];
if (curDist < bestDist)
{
bestDist = curDist;
bestInd = index;
}
}
__shared__ int cta_dist_buffer[CTA_SIZE];
__shared__ int cta_ind_buffer[CTA_SIZE];
reduceKeyVal<CTA_SIZE>(cta_dist_buffer, bestDist, cta_ind_buffer, bestInd, threadIdx.x, less<int>());
if (threadIdx.x == 0)
{
int y = bestInd / search_window;
int x = bestInd - y * search_window;
velx = x - search_radius;
vely = y - search_radius;
}
}
__device__ __forceinline__ void operator()(PtrStepf velx, PtrStepf vely) const
{
int tbx = blockIdx.x * TILE_COLS;
int tby = blockIdx.y * TILE_ROWS;
int tex = ::min(tbx + TILE_COLS, I0.cols);
int tey = ::min(tby + TILE_ROWS, I0.rows);
PtrStepi col_sums;
col_sums.data = buffer.ptr(I0.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window;
col_sums.step = buffer.step;
PtrStepi up_col_sums;
up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window;
up_col_sums.step = buffer.step;
extern __shared__ int dist_sums[]; //search_window * search_window
int first = 0;
for (int i = tby; i < tey; ++i)
{
for (int j = tbx; j < tex; ++j)
{
__syncthreads();
if (j == tbx)
{
initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums);
first = 0;
}
else
{
if (i == tby)
shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums);
else
shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums);
first = (first + 1) % block_window;
}
__syncthreads();
convolve_window(i, j, dist_sums, velx(i, j), vely(i, j));
}
}
}
};
template<typename T> __global__ void optflowbm_fast_kernel(const FastOptFlowBM<T> fbm, PtrStepf velx, PtrStepf vely)
{
fbm(velx, vely);
}
void get_buffer_size(int src_cols, int src_rows, int search_window, int block_window, int& buffer_cols, int& buffer_rows)
{
dim3 grid(divUp(src_cols, TILE_COLS), divUp(src_rows, TILE_ROWS));
buffer_cols = search_window * search_window * grid.y;
buffer_rows = src_cols + block_window * grid.x;
}
template <typename T>
void calc(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream)
{
FastOptFlowBM<T> fbm(search_window, block_window, I0, I1, buffer);
dim3 block(CTA_SIZE, 1);
dim3 grid(divUp(I0.cols, TILE_COLS), divUp(I0.rows, TILE_ROWS));
size_t smem = search_window * search_window * sizeof(int);
optflowbm_fast_kernel<<<grid, block, smem, stream>>>(fbm, velx, vely);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void calc<uchar>(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream);
}
#endif // !defined CUDA_DISABLER


@@ -0,0 +1,220 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace optical_flow
{
#define NEEDLE_MAP_SCALE 16
#define NUM_VERTS_PER_ARROW 6
__global__ void NeedleMapAverageKernel(const PtrStepSzf u, const PtrStepf v, PtrStepf u_avg, PtrStepf v_avg)
{
__shared__ float smem[2 * NEEDLE_MAP_SCALE];
volatile float* u_col_sum = smem;
volatile float* v_col_sum = u_col_sum + NEEDLE_MAP_SCALE;
const int x = blockIdx.x * NEEDLE_MAP_SCALE + threadIdx.x;
const int y = blockIdx.y * NEEDLE_MAP_SCALE;
u_col_sum[threadIdx.x] = 0;
v_col_sum[threadIdx.x] = 0;
#pragma unroll
for(int i = 0; i < NEEDLE_MAP_SCALE; ++i)
{
u_col_sum[threadIdx.x] += u(::min(y + i, u.rows - 1), x);
v_col_sum[threadIdx.x] += v(::min(y + i, u.rows - 1), x);
}
if (threadIdx.x < 8)
{
    // warp-synchronous tree reduction of the 16 partial column sums into
    // u_col_sum[0] / v_col_sum[0]
    u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 8];
    v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 8];
    if (threadIdx.x < 4)
    {
        u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 4];
        v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 4];
    }
    if (threadIdx.x < 2)
    {
        u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 2];
        v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 2];
    }
    if (threadIdx.x == 0)
    {
        u_col_sum[0] += u_col_sum[1];
        v_col_sum[0] += v_col_sum[1];
    }
}
if (threadIdx.x == 0)
{
const float coeff = 1.0f / (NEEDLE_MAP_SCALE * NEEDLE_MAP_SCALE);
u_col_sum[0] *= coeff;
v_col_sum[0] *= coeff;
u_avg(blockIdx.y, blockIdx.x) = u_col_sum[0];
v_avg(blockIdx.y, blockIdx.x) = v_col_sum[0];
}
}
void NeedleMapAverage_gpu(PtrStepSzf u, PtrStepSzf v, PtrStepSzf u_avg, PtrStepSzf v_avg)
{
const dim3 block(NEEDLE_MAP_SCALE);
const dim3 grid(u_avg.cols, u_avg.rows);
NeedleMapAverageKernel<<<grid, block>>>(u, v, u_avg, v_avg);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void NeedleMapVertexKernel(const PtrStepSzf u_avg, const PtrStepf v_avg, float* vertex_data, float* color_data, float max_flow, float xscale, float yscale)
{
// test - just draw a triangle at each pixel
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const float arrow_x = x * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
const float arrow_y = y * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
float3 v[NUM_VERTS_PER_ARROW];
if (x < u_avg.cols && y < u_avg.rows)
{
const float u_avg_val = u_avg(y, x);
const float v_avg_val = v_avg(y, x);
const float theta = ::atan2f(v_avg_val, u_avg_val);// + CV_PI;
float r = ::sqrtf(v_avg_val * v_avg_val + u_avg_val * u_avg_val);
r = fmin(14.0f * (r / max_flow), 14.0f);
v[0].z = 1.0f;
v[1].z = 0.7f;
v[2].z = 0.7f;
v[3].z = 0.7f;
v[4].z = 0.7f;
v[5].z = 1.0f;
v[0].x = arrow_x;
v[0].y = arrow_y;
v[5].x = arrow_x;
v[5].y = arrow_y;
v[2].x = arrow_x + r * ::cosf(theta);
v[2].y = arrow_y + r * ::sinf(theta);
v[3].x = v[2].x;
v[3].y = v[2].y;
r = ::fmin(r, 2.5f);
v[1].x = arrow_x + r * ::cosf(theta - CV_PI_F / 2.0f);
v[1].y = arrow_y + r * ::sinf(theta - CV_PI_F / 2.0f);
v[4].x = arrow_x + r * ::cosf(theta + CV_PI_F / 2.0f);
v[4].y = arrow_y + r * ::sinf(theta + CV_PI_F / 2.0f);
int indx = (y * u_avg.cols + x) * NUM_VERTS_PER_ARROW * 3;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[0].x * xscale;
vertex_data[indx++] = v[0].y * yscale;
vertex_data[indx++] = v[0].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[1].x * xscale;
vertex_data[indx++] = v[1].y * yscale;
vertex_data[indx++] = v[1].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[2].x * xscale;
vertex_data[indx++] = v[2].y * yscale;
vertex_data[indx++] = v[2].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[3].x * xscale;
vertex_data[indx++] = v[3].y * yscale;
vertex_data[indx++] = v[3].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[4].x * xscale;
vertex_data[indx++] = v[4].y * yscale;
vertex_data[indx++] = v[4].z;
color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
vertex_data[indx++] = v[5].x * xscale;
vertex_data[indx++] = v[5].y * yscale;
vertex_data[indx++] = v[5].z;
}
}
void CreateOpticalFlowNeedleMap_gpu(PtrStepSzf u_avg, PtrStepSzf v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale)
{
const dim3 block(16);
const dim3 grid(divUp(u_avg.cols, block.x), divUp(u_avg.rows, block.y));
NeedleMapVertexKernel<<<grid, block>>>(u_avg, v_avg, vertex_buffer, color_data, max_flow, xscale, yscale);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,647 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#define tx threadIdx.x
#define ty threadIdx.y
#define bx blockIdx.x
#define by blockIdx.y
#define bdx blockDim.x
#define bdy blockDim.y
#define BORDER_SIZE 5
#define MAX_KSIZE_HALF 100
namespace cv { namespace gpu { namespace cudev { namespace optflow_farneback
{
__constant__ float c_g[8];
__constant__ float c_xg[8];
__constant__ float c_xxg[8];
__constant__ float c_ig11, c_ig03, c_ig33, c_ig55;
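// Farneback polynomial expansion: c_g holds the 1-D Gaussian applicability kernel, c_xg and
// c_xxg the same kernel weighted by x and x^2, and c_ig11/c_ig03/c_ig33/c_ig55 appear to be the
// needed entries of the inverse Gram matrix of the polynomial basis. The kernel below writes the
// five expansion coefficients of every pixel into dst, stacked as five height-tall planes
// (rows y, height + y, ..., 4*height + y).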
template <int polyN>
__global__ void polynomialExpansion(
const int height, const int width, const PtrStepf src, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * (bdx - 2*polyN) + tx - polyN;
if (y < height)
{
extern __shared__ float smem[];
volatile float *row = smem + tx;
int xWarped = ::min(::max(x, 0), width - 1);
row[0] = src(y, xWarped) * c_g[0];
row[bdx] = 0.f;
row[2*bdx] = 0.f;
for (int k = 1; k <= polyN; ++k)
{
float t0 = src(::max(y - k, 0), xWarped);
float t1 = src(::min(y + k, height - 1), xWarped);
row[0] += c_g[k] * (t0 + t1);
row[bdx] += c_xg[k] * (t1 - t0);
row[2*bdx] += c_xxg[k] * (t0 + t1);
}
__syncthreads();
if (tx >= polyN && tx + polyN < bdx && x < width)
{
float b1 = c_g[0] * row[0];
float b3 = c_g[0] * row[bdx];
float b5 = c_g[0] * row[2*bdx];
float b2 = 0, b4 = 0, b6 = 0;
for (int k = 1; k <= polyN; ++k)
{
b1 += (row[k] + row[-k]) * c_g[k];
b4 += (row[k] + row[-k]) * c_xxg[k];
b2 += (row[k] - row[-k]) * c_xg[k];
b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];
b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];
b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];
}
dst(y, xWarped) = b3*c_ig11;
dst(height + y, xWarped) = b2*c_ig11;
dst(2*height + y, xWarped) = b1*c_ig03 + b5*c_ig33;
dst(3*height + y, xWarped) = b1*c_ig03 + b4*c_ig33;
dst(4*height + y, xWarped) = b6*c_ig55;
}
}
}
void setPolynomialExpansionConsts(
int polyN, const float *g, const float *xg, const float *xxg,
float ig11, float ig03, float ig33, float ig55)
{
cudaSafeCall(cudaMemcpyToSymbol(c_g, g, (polyN + 1) * sizeof(*g)));
cudaSafeCall(cudaMemcpyToSymbol(c_xg, xg, (polyN + 1) * sizeof(*xg)));
cudaSafeCall(cudaMemcpyToSymbol(c_xxg, xxg, (polyN + 1) * sizeof(*xxg)));
cudaSafeCall(cudaMemcpyToSymbol(c_ig11, &ig11, sizeof(ig11)));
cudaSafeCall(cudaMemcpyToSymbol(c_ig03, &ig03, sizeof(ig03)));
cudaSafeCall(cudaMemcpyToSymbol(c_ig33, &ig33, sizeof(ig33)));
cudaSafeCall(cudaMemcpyToSymbol(c_ig55, &ig55, sizeof(ig55)));
}
void polynomialExpansionGpu(const PtrStepSzf &src, int polyN, PtrStepSzf dst, cudaStream_t stream)
{
dim3 block(256);
dim3 grid(divUp(src.cols, block.x - 2*polyN), src.rows);
int smem = 3 * block.x * sizeof(float);
if (polyN == 5)
polynomialExpansion<5><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
else if (polyN == 7)
polynomialExpansion<7><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
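// Note: dst therefore needs 5 * src.rows rows and src.cols columns. Each block produces
// (blockDim.x - 2*polyN) output columns, hence the grid width divUp(src.cols, block.x - 2*polyN);
// only polyN == 5 and polyN == 7 are instantiated.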
__constant__ float c_border[BORDER_SIZE + 1];
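// updateMatrices warps the second frame's expansion R1 by the current flow (bilinear lookup),
// blends it with R0, and writes the per-pixel 2x2 linear system into M as five stacked planes
// (g11, g12, g22, h1, h2), down-weighting pixels near the image border via c_border.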
__global__ void updateMatrices(
const int height, const int width, const PtrStepf flowx, const PtrStepf flowy,
const PtrStepf R0, const PtrStepf R1, PtrStepf M)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
if (y < height && x < width)
{
float dx = flowx(y, x);
float dy = flowy(y, x);
float fx = x + dx;
float fy = y + dy;
int x1 = floorf(fx);
int y1 = floorf(fy);
fx -= x1; fy -= y1;
float r2, r3, r4, r5, r6;
if (x1 >= 0 && y1 >= 0 && x1 < width - 1 && y1 < height - 1)
{
float a00 = (1.f - fx) * (1.f - fy);
float a01 = fx * (1.f - fy);
float a10 = (1.f - fx) * fy;
float a11 = fx * fy;
r2 = a00 * R1(y1, x1) +
a01 * R1(y1, x1 + 1) +
a10 * R1(y1 + 1, x1) +
a11 * R1(y1 + 1, x1 + 1);
r3 = a00 * R1(height + y1, x1) +
a01 * R1(height + y1, x1 + 1) +
a10 * R1(height + y1 + 1, x1) +
a11 * R1(height + y1 + 1, x1 + 1);
r4 = a00 * R1(2*height + y1, x1) +
a01 * R1(2*height + y1, x1 + 1) +
a10 * R1(2*height + y1 + 1, x1) +
a11 * R1(2*height + y1 + 1, x1 + 1);
r5 = a00 * R1(3*height + y1, x1) +
a01 * R1(3*height + y1, x1 + 1) +
a10 * R1(3*height + y1 + 1, x1) +
a11 * R1(3*height + y1 + 1, x1 + 1);
r6 = a00 * R1(4*height + y1, x1) +
a01 * R1(4*height + y1, x1 + 1) +
a10 * R1(4*height + y1 + 1, x1) +
a11 * R1(4*height + y1 + 1, x1 + 1);
r4 = (R0(2*height + y, x) + r4) * 0.5f;
r5 = (R0(3*height + y, x) + r5) * 0.5f;
r6 = (R0(4*height + y, x) + r6) * 0.25f;
}
else
{
r2 = r3 = 0.f;
r4 = R0(2*height + y, x);
r5 = R0(3*height + y, x);
r6 = R0(4*height + y, x) * 0.5f;
}
r2 = (R0(y, x) - r2) * 0.5f;
r3 = (R0(height + y, x) - r3) * 0.5f;
r2 += r4*dy + r6*dx;
r3 += r6*dy + r5*dx;
float scale =
c_border[::min(x, BORDER_SIZE)] *
c_border[::min(y, BORDER_SIZE)] *
c_border[::min(width - x - 1, BORDER_SIZE)] *
c_border[::min(height - y - 1, BORDER_SIZE)];
r2 *= scale; r3 *= scale; r4 *= scale;
r5 *= scale; r6 *= scale;
M(y, x) = r4*r4 + r6*r6;
M(height + y, x) = (r4 + r5)*r6;
M(2*height + y, x) = r5*r5 + r6*r6;
M(3*height + y, x) = r4*r2 + r6*r3;
M(4*height + y, x) = r6*r2 + r5*r3;
}
}
void setUpdateMatricesConsts()
{
static const float border[BORDER_SIZE + 1] = {0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f};
cudaSafeCall(cudaMemcpyToSymbol(c_border, border, (BORDER_SIZE + 1) * sizeof(*border)));
}
void updateMatricesGpu(
const PtrStepSzf flowx, const PtrStepSzf flowy, const PtrStepSzf R0, const PtrStepSzf R1,
PtrStepSzf M, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
updateMatrices<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, flowx, flowy, R0, R1, M);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
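// updateFlow solves, for every pixel, the 2x2 system accumulated in M (after smoothing), adding a
// small 1e-3 term to the determinant to keep it invertible, and overwrites flowx/flowy with the
// resulting displacement.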
__global__ void updateFlow(
const int height, const int width, const PtrStepf M, PtrStepf flowx, PtrStepf flowy)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
if (y < height && x < width)
{
float g11 = M(y, x);
float g12 = M(height + y, x);
float g22 = M(2*height + y, x);
float h1 = M(3*height + y, x);
float h2 = M(4*height + y, x);
float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);
flowx(y, x) = (g11*h2 - g12*h1) * detInv;
flowy(y, x) = (g22*h1 - g12*h2) * detInv;
}
}
void updateFlowGpu(const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
updateFlow<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, M, flowx, flowy);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
/*__global__ void boxFilter(
const int height, const int width, const PtrStepf src,
const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
extern __shared__ float smem[];
volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
if (y < height)
{
// Vertical pass
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
{
int xExt = int(bx * bdx) + i - ksizeHalf;
xExt = ::min(::max(xExt, 0), width - 1);
row[i] = src(y, xExt);
for (int j = 1; j <= ksizeHalf; ++j)
row[i] += src(::max(y - j, 0), xExt) + src(::min(y + j, height - 1), xExt);
}
if (x < width)
{
__syncthreads();
// Horizontal pass
row += tx + ksizeHalf;
float res = row[0];
for (int i = 1; i <= ksizeHalf; ++i)
res += row[-i] + row[i];
dst(y, x) = res * boxAreaInv;
}
}
}
void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
dim3 block(256);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
boxFilter<<<grid, block, smem, stream>>>(src.rows, src.cols, src, ksizeHalf, boxAreaInv, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}*/
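// boxFilter5 is the same separable box filter as the disabled single-plane version above, but it
// filters the five stacked planes of M in one pass: the vertical pass keeps five running sums per
// column in shared memory (one smw-wide row per plane), the horizontal pass finishes in registers.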
__global__ void boxFilter5(
const int height, const int width, const PtrStepf src,
const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
extern __shared__ float smem[];
const int smw = bdx + 2*ksizeHalf; // shared memory "width"
volatile float *row = smem + 5 * ty * smw;
if (y < height)
{
// Vertical pass
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
{
int xExt = int(bx * bdx) + i - ksizeHalf;
xExt = ::min(::max(xExt, 0), width - 1);
#pragma unroll
for (int k = 0; k < 5; ++k)
row[k*smw + i] = src(k*height + y, xExt);
for (int j = 1; j <= ksizeHalf; ++j)
#pragma unroll
for (int k = 0; k < 5; ++k)
row[k*smw + i] +=
src(k*height + ::max(y - j, 0), xExt) +
src(k*height + ::min(y + j, height - 1), xExt);
}
if (x < width)
{
__syncthreads();
// Horizontal pass
row += tx + ksizeHalf;
float res[5];
#pragma unroll
for (int k = 0; k < 5; ++k)
res[k] = row[k*smw];
for (int i = 1; i <= ksizeHalf; ++i)
#pragma unroll
for (int k = 0; k < 5; ++k)
res[k] += row[k*smw - i] + row[k*smw + i];
#pragma unroll
for (int k = 0; k < 5; ++k)
dst(k*height + y, x) = res[k] * boxAreaInv;
}
}
}
void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
dim3 block(256);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
dim3 block(128);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
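// The _CC11 variant only shrinks the block to 128 threads, presumably to fit the tighter shared
// memory and register budget of compute capability 1.1 devices; the kernel itself is unchanged.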
__constant__ float c_gKer[MAX_KSIZE_HALF + 1];
template <typename Border>
__global__ void gaussianBlur(
const int height, const int width, const PtrStepf src, const int ksizeHalf,
const Border b, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
extern __shared__ float smem[];
volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
if (y < height)
{
// Vertical pass
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
{
int xExt = int(bx * bdx) + i - ksizeHalf;
xExt = b.idx_col(xExt);
row[i] = src(y, xExt) * c_gKer[0];
for (int j = 1; j <= ksizeHalf; ++j)
row[i] +=
(src(b.idx_row_low(y - j), xExt) +
src(b.idx_row_high(y + j), xExt)) * c_gKer[j];
}
if (x < width)
{
__syncthreads();
// Horizontal pass
row += tx + ksizeHalf;
float res = row[0] * c_gKer[0];
for (int i = 1; i <= ksizeHalf; ++i)
res += (row[-i] + row[i]) * c_gKer[i];
dst(y, x) = res;
}
}
}
void setGaussianBlurKernel(const float *gKer, int ksizeHalf)
{
cudaSafeCall(cudaMemcpyToSymbol(c_gKer, gKer, (ksizeHalf + 1) * sizeof(*gKer)));
}
template <typename Border>
void gaussianBlurCaller(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows;
int width = src.cols;
dim3 block(256);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
Border b(height, width);
gaussianBlur<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void gaussianBlurGpu(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
static const caller_t callers[] =
{
gaussianBlurCaller<BrdReflect101<float> >,
gaussianBlurCaller<BrdReplicate<float> >,
};
callers[borderMode](src, ksizeHalf, dst, stream);
}
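// borderMode is used directly as an index into the caller table: 0 selects BrdReflect101 and
// 1 selects BrdReplicate. The host side is presumably responsible for mapping OpenCV border
// flags onto these two slots before calling.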
template <typename Border>
__global__ void gaussianBlur5(
const int height, const int width, const PtrStepf src, const int ksizeHalf,
const Border b, PtrStepf dst)
{
const int y = by * bdy + ty;
const int x = bx * bdx + tx;
extern __shared__ float smem[];
const int smw = bdx + 2*ksizeHalf; // shared memory "width"
volatile float *row = smem + 5 * ty * smw;
if (y < height)
{
// Vertical pass
for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
{
int xExt = int(bx * bdx) + i - ksizeHalf;
xExt = b.idx_col(xExt);
#pragma unroll
for (int k = 0; k < 5; ++k)
row[k*smw + i] = src(k*height + y, xExt) * c_gKer[0];
for (int j = 1; j <= ksizeHalf; ++j)
#pragma unroll
for (int k = 0; k < 5; ++k)
row[k*smw + i] +=
(src(k*height + b.idx_row_low(y - j), xExt) +
src(k*height + b.idx_row_high(y + j), xExt)) * c_gKer[j];
}
if (x < width)
{
__syncthreads();
// Horizontal pass
row += tx + ksizeHalf;
float res[5];
#pragma unroll
for (int k = 0; k < 5; ++k)
res[k] = row[k*smw] * c_gKer[0];
for (int i = 1; i <= ksizeHalf; ++i)
#pragma unroll
for (int k = 0; k < 5; ++k)
res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];
#pragma unroll
for (int k = 0; k < 5; ++k)
dst(k*height + y, x) = res[k];
}
}
}
template <typename Border, int blockDimX>
void gaussianBlur5Caller(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
dim3 block(blockDimX);
dim3 grid(divUp(width, block.x), divUp(height, block.y));
int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
Border b(height, width);
gaussianBlur5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void gaussianBlur5Gpu(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
static const caller_t callers[] =
{
gaussianBlur5Caller<BrdReflect101<float>,256>,
gaussianBlur5Caller<BrdReplicate<float>,256>,
};
callers[borderMode](src, ksizeHalf, dst, stream);
}
void gaussianBlur5Gpu_CC11(
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
static const caller_t callers[] =
{
gaussianBlur5Caller<BrdReflect101<float>,128>,
gaussianBlur5Caller<BrdReplicate<float>,128>,
};
callers[borderMode](src, ksizeHalf, dst, stream);
}
}}}} // namespace cv { namespace gpu { namespace cudev { namespace optflow_farneback
#endif /* CUDA_DISABLER */

modules/gpu/src/cuda/orb.cu Normal file

@@ -0,0 +1,424 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace orb
{
////////////////////////////////////////////////////////////////////////////////////////////////////////
// cull
int cull_gpu(int* loc, float* response, int size, int n_points)
{
thrust::device_ptr<int> loc_ptr(loc);
thrust::device_ptr<float> response_ptr(response);
thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
return n_points;
}
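// cull_gpu sorts the keypoint indices in loc by response in descending order (thrust::sort_by_key
// with thrust::greater<float>), so the strongest n_points entries end up at the front of both
// arrays; it simply returns n_points as the new keypoint count.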
////////////////////////////////////////////////////////////////////////////////////////////////////////
// HarrisResponses
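// One keypoint per 32-thread row of the block (blockDim = 32x8): the row accumulates the
// structure-tensor sums a = sum(Ix^2), b = sum(Iy^2), c = sum(Ix*Iy) over the blockSize x
// blockSize patch using Sobel-style 3x3 derivatives, reduces them in shared memory, and lane 0
// writes the scaled Harris response det - harris_k * trace^2.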
__global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
{
__shared__ int smem0[8 * 32];
__shared__ int smem1[8 * 32];
__shared__ int smem2[8 * 32];
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
if (ptidx < npoints)
{
const short2 loc = loc_[ptidx];
const int r = blockSize / 2;
const int x0 = loc.x - r;
const int y0 = loc.y - r;
int a = 0, b = 0, c = 0;
for (int ind = threadIdx.x; ind < blockSize * blockSize; ind += blockDim.x)
{
const int i = ind / blockSize;
const int j = ind % blockSize;
int Ix = (img(y0 + i, x0 + j + 1) - img(y0 + i, x0 + j - 1)) * 2 +
(img(y0 + i - 1, x0 + j + 1) - img(y0 + i - 1, x0 + j - 1)) +
(img(y0 + i + 1, x0 + j + 1) - img(y0 + i + 1, x0 + j - 1));
int Iy = (img(y0 + i + 1, x0 + j) - img(y0 + i - 1, x0 + j)) * 2 +
(img(y0 + i + 1, x0 + j - 1) - img(y0 + i - 1, x0 + j - 1)) +
(img(y0 + i + 1, x0 + j + 1) - img(y0 + i - 1, x0 + j + 1));
a += Ix * Ix;
b += Iy * Iy;
c += Ix * Iy;
}
int* srow0 = smem0 + threadIdx.y * blockDim.x;
int* srow1 = smem1 + threadIdx.y * blockDim.x;
int* srow2 = smem2 + threadIdx.y * blockDim.x;
plus<int> op;
reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op));
if (threadIdx.x == 0)
{
float scale = (1 << 2) * blockSize * 255.0f;
scale = 1.0f / scale;
const float scale_sq_sq = scale * scale * scale * scale;
response[ptidx] = ((float)a * b - (float)c * c - harris_k * ((float)a + b) * ((float)a + b)) * scale_sq_sq;
}
}
}
void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(npoints, block.y);
HarrisResponses<<<grid, block, 0, stream>>>(img, loc, response, npoints, blockSize, harris_k);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// IC_Angle
__constant__ int c_u_max[32];
void loadUMax(const int* u_max, int count)
{
cudaSafeCall( cudaMemcpyToSymbol(c_u_max, u_max, count * sizeof(int)) );
}
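// IC_Angle computes the intensity-centroid orientation of each keypoint: the moments m_10 and
// m_01 are accumulated over a circular patch whose per-row half-width comes from c_u_max (loaded
// above), reduced across the 32 threads of the row, and converted to an angle in degrees in
// [0, 360).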
__global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
{
__shared__ int smem0[8 * 32];
__shared__ int smem1[8 * 32];
int* srow0 = smem0 + threadIdx.y * blockDim.x;
int* srow1 = smem1 + threadIdx.y * blockDim.x;
plus<int> op;
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
if (ptidx < npoints)
{
int m_01 = 0, m_10 = 0;
const short2 loc = loc_[ptidx];
// Treat the center line differently, v=0
for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
m_10 += u * image(loc.y, loc.x + u);
reduce<32>(srow0, m_10, threadIdx.x, op);
for (int v = 1; v <= half_k; ++v)
{
// Proceed over the two lines
int v_sum = 0;
int m_sum = 0;
const int d = c_u_max[v];
for (int u = threadIdx.x - d; u <= d; u += blockDim.x)
{
int val_plus = image(loc.y + v, loc.x + u);
int val_minus = image(loc.y - v, loc.x + u);
v_sum += (val_plus - val_minus);
m_sum += u * (val_plus + val_minus);
}
reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op));
m_10 += m_sum;
m_01 += v * v_sum;
}
if (threadIdx.x == 0)
{
float kp_dir = ::atan2f((float)m_01, (float)m_10);
kp_dir += (kp_dir < 0) * (2.0f * CV_PI);
kp_dir *= 180.0f / CV_PI;
angle[ptidx] = kp_dir;
}
}
}
void IC_Angle_gpu(PtrStepSzb image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(npoints, block.y);
IC_Angle<<<grid, block, 0, stream>>>(image, loc, angle, npoints, half_k);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// computeOrbDescriptor
template <int WTA_K> struct OrbDescriptor;
#define GET_VALUE(idx) \
img(loc.y + __float2int_rn(pattern_x[idx] * sina + pattern_y[idx] * cosa), \
loc.x + __float2int_rn(pattern_x[idx] * cosa - pattern_y[idx] * sina))
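// GET_VALUE samples the binary-test pattern after rotating it by the keypoint orientation
// (sina/cosa). The OrbDescriptor specializations below pack 8 one-bit comparisons per descriptor
// byte for WTA_K == 2, and four 2-bit results per byte for WTA_K == 3 and WTA_K == 4.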
template <> struct OrbDescriptor<2>
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 16 * i;
pattern_y += 16 * i;
int t0, t1, val;
t0 = GET_VALUE(0); t1 = GET_VALUE(1);
val = t0 < t1;
t0 = GET_VALUE(2); t1 = GET_VALUE(3);
val |= (t0 < t1) << 1;
t0 = GET_VALUE(4); t1 = GET_VALUE(5);
val |= (t0 < t1) << 2;
t0 = GET_VALUE(6); t1 = GET_VALUE(7);
val |= (t0 < t1) << 3;
t0 = GET_VALUE(8); t1 = GET_VALUE(9);
val |= (t0 < t1) << 4;
t0 = GET_VALUE(10); t1 = GET_VALUE(11);
val |= (t0 < t1) << 5;
t0 = GET_VALUE(12); t1 = GET_VALUE(13);
val |= (t0 < t1) << 6;
t0 = GET_VALUE(14); t1 = GET_VALUE(15);
val |= (t0 < t1) << 7;
return val;
}
};
template <> struct OrbDescriptor<3>
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 12 * i;
pattern_y += 12 * i;
int t0, t1, t2, val;
t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);
val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);
t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;
t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;
t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;
return val;
}
};
template <> struct OrbDescriptor<4>
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 16 * i;
pattern_y += 16 * i;
int t0, t1, t2, t3, k, val;
int a, b;
t0 = GET_VALUE(0); t1 = GET_VALUE(1);
t2 = GET_VALUE(2); t3 = GET_VALUE(3);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val = k;
t0 = GET_VALUE(4); t1 = GET_VALUE(5);
t2 = GET_VALUE(6); t3 = GET_VALUE(7);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 2;
t0 = GET_VALUE(8); t1 = GET_VALUE(9);
t2 = GET_VALUE(10); t3 = GET_VALUE(11);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 4;
t0 = GET_VALUE(12); t1 = GET_VALUE(13);
t2 = GET_VALUE(14); t3 = GET_VALUE(15);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 6;
return val;
}
};
#undef GET_VALUE
template <int WTA_K>
__global__ void computeOrbDescriptor(const PtrStepb img, const short2* loc, const float* angle_, const int npoints,
const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize)
{
const int descidx = blockIdx.x * blockDim.x + threadIdx.x;
const int ptidx = blockIdx.y * blockDim.y + threadIdx.y;
if (ptidx < npoints && descidx < dsize)
{
float angle = angle_[ptidx];
angle *= (float)(CV_PI / 180.f);
float sina, cosa;
::sincosf(angle, &sina, &cosa);
desc.ptr(ptidx)[descidx] = OrbDescriptor<WTA_K>::calc(img, loc[ptidx], pattern_x, pattern_y, sina, cosa, descidx);
}
}
void computeOrbDescriptor_gpu(PtrStepb img, const short2* loc, const float* angle, const int npoints,
const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize, int WTA_K, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(dsize, block.x);
grid.y = divUp(npoints, block.y);
switch (WTA_K)
{
case 2:
computeOrbDescriptor<2><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
break;
case 3:
computeOrbDescriptor<3><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
break;
case 4:
computeOrbDescriptor<4><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// mergeLocation
__global__ void mergeLocation(const short2* loc_, float* x, float* y, const int npoints, float scale)
{
const int ptidx = blockIdx.x * blockDim.x + threadIdx.x;
if (ptidx < npoints)
{
short2 loc = loc_[ptidx];
x[ptidx] = loc.x * scale;
y[ptidx] = loc.y * scale;
}
}
void mergeLocation_gpu(const short2* loc, float* x, float* y, int npoints, float scale, cudaStream_t stream)
{
dim3 block(256);
dim3 grid;
grid.x = divUp(npoints, block.x);
mergeLocation<<<grid, block, 0, stream>>>(loc, x, y, npoints, scale);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,228 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
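// pyrDown: Gaussian pyramid downsampling. Each block covers a 256-column stretch of one
// destination row; the vertical 5-tap pass (1 4 6 4 1)/16 is staged in shared memory with a
// two-pixel apron on each side, then the first 128 threads run the horizontal pass on every
// other column and write the decimated, saturate-cast result.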
template <typename T, typename B> __global__ void pyrDown(const PtrStepSz<T> src, PtrStep<T> dst, const B b, int dst_cols)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_t;
__shared__ work_t smem[256 + 4];
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y;
const int src_y = 2 * y;
if (src_y >= 2 && src_y < src.rows - 2 && x >= 2 && x < src.cols - 2)
{
{
work_t sum;
sum = 0.0625f * src(src_y - 2, x);
sum = sum + 0.25f * src(src_y - 1, x);
sum = sum + 0.375f * src(src_y , x);
sum = sum + 0.25f * src(src_y + 1, x);
sum = sum + 0.0625f * src(src_y + 2, x);
smem[2 + threadIdx.x] = sum;
}
if (threadIdx.x < 2)
{
const int left_x = x - 2;
work_t sum;
sum = 0.0625f * src(src_y - 2, left_x);
sum = sum + 0.25f * src(src_y - 1, left_x);
sum = sum + 0.375f * src(src_y , left_x);
sum = sum + 0.25f * src(src_y + 1, left_x);
sum = sum + 0.0625f * src(src_y + 2, left_x);
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + 2;
work_t sum;
sum = 0.0625f * src(src_y - 2, right_x);
sum = sum + 0.25f * src(src_y - 1, right_x);
sum = sum + 0.375f * src(src_y , right_x);
sum = sum + 0.25f * src(src_y + 1, right_x);
sum = sum + 0.0625f * src(src_y + 2, right_x);
smem[4 + threadIdx.x] = sum;
}
}
else
{
{
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(x));
sum = sum + 0.375f * src(src_y , b.idx_col_high(x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(x));
smem[2 + threadIdx.x] = sum;
}
if (threadIdx.x < 2)
{
const int left_x = x - 2;
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col(left_x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col(left_x));
sum = sum + 0.375f * src(src_y , b.idx_col(left_x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col(left_x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col(left_x));
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + 2;
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(right_x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(right_x));
sum = sum + 0.375f * src(src_y , b.idx_col_high(right_x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(right_x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(right_x));
smem[4 + threadIdx.x] = sum;
}
}
__syncthreads();
if (threadIdx.x < 128)
{
const int tid2 = threadIdx.x * 2;
work_t sum;
sum = 0.0625f * smem[2 + tid2 - 2];
sum = sum + 0.25f * smem[2 + tid2 - 1];
sum = sum + 0.375f * smem[2 + tid2 ];
sum = sum + 0.25f * smem[2 + tid2 + 1];
sum = sum + 0.0625f * smem[2 + tid2 + 2];
const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
if (dst_x < dst_cols)
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
}
}
template <typename T, template <typename> class B> void pyrDown_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(src.cols, block.x), dst.rows);
B<T> b(src.rows, src.cols);
pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
pyrDown_caller<T, BrdReflect101>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
}
template void pyrDown_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */


@@ -0,0 +1,196 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
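// pyrUp: Gaussian pyramid upsampling. A 10x10 source patch (reflected at the near border,
// clamped at the far one) is staged in shared memory, expanded into a 20x16 patch by the
// horizontal 5-tap pass (the even/odd flags implement the zero insertion of the upsampling),
// and the same kernel is then applied vertically; the result is scaled by 4 to preserve
// brightness.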
template <typename T> __global__ void pyrUp(const PtrStepSz<T> src, PtrStepSz<T> dst)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
__shared__ sum_t s_srcPatch[10][10];
__shared__ sum_t s_dstPatch[20][16];
if (threadIdx.x < 10 && threadIdx.y < 10)
{
int srcx = static_cast<int>((blockIdx.x * blockDim.x) / 2 + threadIdx.x) - 1;
int srcy = static_cast<int>((blockIdx.y * blockDim.y) / 2 + threadIdx.y) - 1;
srcx = ::abs(srcx);
srcx = ::min(src.cols - 1, srcx);
srcy = ::abs(srcy);
srcy = ::min(src.rows - 1, srcy);
s_srcPatch[threadIdx.y][threadIdx.x] = saturate_cast<sum_t>(src(srcy, srcx));
}
__syncthreads();
sum_t sum = VecTraits<sum_t>::all(0);
const int evenFlag = static_cast<int>((threadIdx.x & 1) == 0);
const int oddFlag = static_cast<int>((threadIdx.x & 1) != 0);
const bool eveny = ((threadIdx.y & 1) == 0);
const int tidx = threadIdx.x;
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[2 + threadIdx.y][threadIdx.x] = sum;
if (threadIdx.y < 2)
{
sum = VecTraits<sum_t>::all(0);
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[threadIdx.y][threadIdx.x] = sum;
}
if (threadIdx.y > 13)
{
sum = VecTraits<sum_t>::all(0);
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + threadIdx.y][threadIdx.x] = sum;
}
__syncthreads();
sum = VecTraits<sum_t>::all(0);
const int tidy = threadIdx.y;
sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][threadIdx.x];
sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][threadIdx.x];
sum = sum + 0.375f * s_dstPatch[2 + tidy ][threadIdx.x];
sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][threadIdx.x];
sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][threadIdx.x];
if (x < dst.cols && y < dst.rows)
dst(y, x) = saturate_cast<T>(4.0f * sum);
}
template <typename T> void pyrUp_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
pyrUp<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
pyrUp_caller<T>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
}
template void pyrUp_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */


@@ -0,0 +1,560 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/reduce.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace pyrlk
{
__constant__ int c_winSize_x;
__constant__ int c_winSize_y;
__constant__ int c_halfWin_x;
__constant__ int c_halfWin_y;
__constant__ int c_iters;
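// Pyramidal Lucas-Kanade state: window size, half-window and iteration count live in constant
// memory (filled by loadConstants below); the image levels are read through textures with
// clamped addressing, using linear filtering for the float images so sub-pixel reads are cheap.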
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
template <int cn> struct Tex_I;
template <> struct Tex_I<1>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_If, x, y);
}
};
template <> struct Tex_I<4>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_If4, x, y);
}
};
template <int cn> struct Tex_J;
template <> struct Tex_J<1>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_Jf, x, y);
}
};
template <> struct Tex_J<4>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_Jf4, x, y);
}
};
__device__ __forceinline__ void accum(float& dst, float val)
{
dst += val;
}
__device__ __forceinline__ void accum(float& dst, const float4& val)
{
dst += val.x + val.y + val.z;
}
__device__ __forceinline__ float abs_(float a)
{
return ::fabsf(a);
}
__device__ __forceinline__ float4 abs_(const float4& a)
{
return abs(a);
}
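// sparseKernel tracks one feature point per block: the threads cooperatively load the template
// patch and its Scharr derivatives from the first image, reduce the 2x2 normal matrix A in shared
// memory, then iterate up to c_iters times, each pass reducing the mismatch vector b against the
// second image and updating nextPt until the step drops below 0.01 pixels in both directions.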
template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
__global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
{
#if __CUDA_ARCH__ <= 110
const int BLOCK_SIZE = 128;
#else
const int BLOCK_SIZE = 256;
#endif
__shared__ float smem1[BLOCK_SIZE];
__shared__ float smem2[BLOCK_SIZE];
__shared__ float smem3[BLOCK_SIZE];
const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
float2 prevPt = prevPts[blockIdx.x];
prevPt.x *= (1.0f / (1 << level));
prevPt.y *= (1.0f / (1 << level));
if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
prevPt.x -= c_halfWin_x;
prevPt.y -= c_halfWin_y;
// extract the patch from the first image, compute the covariance matrix of derivatives
float A11 = 0;
float A12 = 0;
float A22 = 0;
typedef typename TypeVec<float, cn>::vec_type work_type;
work_type I_patch [PATCH_Y][PATCH_X];
work_type dIdx_patch[PATCH_Y][PATCH_X];
work_type dIdy_patch[PATCH_Y][PATCH_X];
for (int yBase = threadIdx.y, i = 0; yBase < c_winSize_y; yBase += blockDim.y, ++i)
{
for (int xBase = threadIdx.x, j = 0; xBase < c_winSize_x; xBase += blockDim.x, ++j)
{
float x = prevPt.x + xBase + 0.5f;
float y = prevPt.y + yBase + 0.5f;
I_patch[i][j] = Tex_I<cn>::read(x, y);
// Scharr derivative
work_type dIdx = 3.0f * Tex_I<cn>::read(x+1, y-1) + 10.0f * Tex_I<cn>::read(x+1, y) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
(3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x-1, y) + 3.0f * Tex_I<cn>::read(x-1, y+1));
work_type dIdy = 3.0f * Tex_I<cn>::read(x-1, y+1) + 10.0f * Tex_I<cn>::read(x, y+1) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
(3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x, y-1) + 3.0f * Tex_I<cn>::read(x+1, y-1));
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
accum(A11, dIdx * dIdx);
accum(A12, dIdx * dIdy);
accum(A22, dIdy * dIdy);
}
}
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, thrust::make_tuple(plus<float>(), plus<float>(), plus<float>()));
#if __CUDA_ARCH__ >= 300
if (tid == 0)
{
smem1[0] = A11;
smem2[0] = A12;
smem3[0] = A22;
}
#endif
__syncthreads();
A11 = smem1[0];
A12 = smem2[0];
A22 = smem3[0];
float D = A11 * A22 - A12 * A12;
if (D < numeric_limits<float>::epsilon())
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
D = 1.f / D;
A11 *= D;
A12 *= D;
A22 *= D;
float2 nextPt = nextPts[blockIdx.x];
nextPt.x *= 2.f;
nextPt.y *= 2.f;
nextPt.x -= c_halfWin_x;
nextPt.y -= c_halfWin_y;
for (int k = 0; k < c_iters; ++k)
{
if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
float b1 = 0;
float b2 = 0;
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
{
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type diff = (J_val - I_val) * 32.0f;
accum(b1, diff * dIdx_patch[i][j]);
accum(b2, diff * dIdy_patch[i][j]);
}
}
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2), thrust::tie(b1, b2), tid, thrust::make_tuple(plus<float>(), plus<float>()));
#if __CUDA_ARCH__ >= 300
if (tid == 0)
{
smem1[0] = b1;
smem2[0] = b2;
}
#endif
__syncthreads();
b1 = smem1[0];
b2 = smem2[0];
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
break;
}
float errval = 0;
if (calcErr)
{
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
{
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type diff = J_val - I_val;
accum(errval, abs_(diff));
}
}
reduce<BLOCK_SIZE>(smem1, errval, tid, plus<float>());
}
if (tid == 0)
{
nextPt.x += c_halfWin_x;
nextPt.y += c_halfWin_y;
nextPts[blockIdx.x] = nextPt;
if (calcErr)
err[blockIdx.x] = static_cast<float>(errval) / (cn * c_winSize_x * c_winSize_y);
}
}
template <int cn, int PATCH_X, int PATCH_Y>
void sparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
if (level == 0 && err)
sparseKernel<cn, PATCH_X, PATCH_Y, true><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
else
sparseKernel<cn, PATCH_X, PATCH_Y, false><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
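// denseKernel is the dense counterpart: one thread per pixel, with the block's image patch and
// its derivatives staged once in shared memory (integer arithmetic on the uchar texture tex_Ib),
// the flow initialized from the coarser level (prevU/prevV scaled by 2), and the same iterative
// 2x2 solve per pixel; optionally the mean absolute residual is written to err.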
template <bool calcErr>
__global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
{
extern __shared__ int smem[];
const int patchWidth = blockDim.x + 2 * c_halfWin_x;
const int patchHeight = blockDim.y + 2 * c_halfWin_y;
int* I_patch = smem;
int* dIdx_patch = I_patch + patchWidth * patchHeight;
int* dIdy_patch = dIdx_patch + patchWidth * patchHeight;
const int xBase = blockIdx.x * blockDim.x;
const int yBase = blockIdx.y * blockDim.y;
for (int i = threadIdx.y; i < patchHeight; i += blockDim.y)
{
for (int j = threadIdx.x; j < patchWidth; j += blockDim.x)
{
float x = xBase - c_halfWin_x + j + 0.5f;
float y = yBase - c_halfWin_y + i + 0.5f;
I_patch[i * patchWidth + j] = tex2D(tex_Ib, x, y);
// Scharr derivative
dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x+1, y-1) + 10 * tex2D(tex_Ib, x+1, y) + 3 * tex2D(tex_Ib, x+1, y+1) -
(3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x-1, y) + 3 * tex2D(tex_Ib, x-1, y+1));
dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x-1, y+1) + 10 * tex2D(tex_Ib, x, y+1) + 3 * tex2D(tex_Ib, x+1, y+1) -
(3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x, y-1) + 3 * tex2D(tex_Ib, x+1, y-1));
}
}
__syncthreads();
const int x = xBase + threadIdx.x;
const int y = yBase + threadIdx.y;
if (x >= cols || y >= rows)
return;
int A11i = 0;
int A12i = 0;
int A22i = 0;
for (int i = 0; i < c_winSize_y; ++i)
{
for (int j = 0; j < c_winSize_x; ++j)
{
int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
A11i += dIdx * dIdx;
A12i += dIdx * dIdy;
A22i += dIdy * dIdy;
}
}
float A11 = A11i;
float A12 = A12i;
float A22 = A22i;
float D = A11 * A22 - A12 * A12;
if (D < numeric_limits<float>::epsilon())
{
if (calcErr)
err(y, x) = numeric_limits<float>::max();
return;
}
D = 1.f / D;
A11 *= D;
A12 *= D;
A22 *= D;
float2 nextPt;
nextPt.x = x + prevU(y/2, x/2) * 2.0f;
nextPt.y = y + prevV(y/2, x/2) * 2.0f;
for (int k = 0; k < c_iters; ++k)
{
if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
{
if (calcErr)
err(y, x) = numeric_limits<float>::max();
return;
}
int b1 = 0;
int b2 = 0;
for (int i = 0; i < c_winSize_y; ++i)
{
for (int j = 0; j < c_winSize_x; ++j)
{
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
int diff = (J - I) * 32;
int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
b1 += diff * dIdx;
b2 += diff * dIdy;
}
}
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
break;
}
u(y, x) = nextPt.x - x;
v(y, x) = nextPt.y - y;
if (calcErr)
{
int errval = 0;
for (int i = 0; i < c_winSize_y; ++i)
{
for (int j = 0; j < c_winSize_x; ++j)
{
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
errval += ::abs(J - I);
}
}
err(y, x) = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);
}
}
void loadConstants(int2 winSize, int iters)
{
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
}
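// Expected host-side call order (a minimal sketch; the real driver is the gpu
// module's pyramidal LK host wrapper, so the loop below is only illustrative):
//
//   loadConstants(winSize, iters);
//   for (int level = nLevels - 1; level >= 0; --level)   // coarse to fine
//       sparse1(I_pyr[level], J_pyr[level], prevPts, nextPts, status, err,
//               npoints, level, block, patch, stream);
//
// patch.x and patch.y must lie in [1, 5]; they index the sparse_caller tables below.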
void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
{
typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream);
static const func_t funcs[5][5] =
{
{sparse_caller<1, 1, 1>, sparse_caller<1, 2, 1>, sparse_caller<1, 3, 1>, sparse_caller<1, 4, 1>, sparse_caller<1, 5, 1>},
{sparse_caller<1, 1, 2>, sparse_caller<1, 2, 2>, sparse_caller<1, 3, 2>, sparse_caller<1, 4, 2>, sparse_caller<1, 5, 2>},
{sparse_caller<1, 1, 3>, sparse_caller<1, 2, 3>, sparse_caller<1, 3, 3>, sparse_caller<1, 4, 3>, sparse_caller<1, 5, 3>},
{sparse_caller<1, 1, 4>, sparse_caller<1, 2, 4>, sparse_caller<1, 3, 4>, sparse_caller<1, 4, 4>, sparse_caller<1, 5, 4>},
{sparse_caller<1, 1, 5>, sparse_caller<1, 2, 5>, sparse_caller<1, 3, 5>, sparse_caller<1, 4, 5>, sparse_caller<1, 5, 5>}
};
bindTexture(&tex_If, I);
bindTexture(&tex_Jf, J);
funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
{
typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream);
static const func_t funcs[5][5] =
{
{sparse_caller<4, 1, 1>, sparse_caller<4, 2, 1>, sparse_caller<4, 3, 1>, sparse_caller<4, 4, 1>, sparse_caller<4, 5, 1>},
{sparse_caller<4, 1, 2>, sparse_caller<4, 2, 2>, sparse_caller<4, 3, 2>, sparse_caller<4, 4, 2>, sparse_caller<4, 5, 2>},
{sparse_caller<4, 1, 3>, sparse_caller<4, 2, 3>, sparse_caller<4, 3, 3>, sparse_caller<4, 4, 3>, sparse_caller<4, 5, 3>},
{sparse_caller<4, 1, 4>, sparse_caller<4, 2, 4>, sparse_caller<4, 3, 4>, sparse_caller<4, 4, 4>, sparse_caller<4, 5, 4>},
{sparse_caller<4, 1, 5>, sparse_caller<4, 2, 5>, sparse_caller<4, 3, 5>, sparse_caller<4, 4, 5>, sparse_caller<4, 5, 5>}
};
bindTexture(&tex_If4, I);
bindTexture(&tex_Jf4, J);
funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
bindTexture(&tex_Ib, I);
bindTexture(&tex_Jf, J);
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
const int patchWidth = block.x + 2 * halfWin.x;
const int patchHeight = block.y + 2 * halfWin.y;
size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int);
if (err.data)
{
denseKernel<true><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, err, I.rows, I.cols);
cudaSafeCall( cudaGetLastError() );
}
else
{
denseKernel<false><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,274 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
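// remap is a pure gather: dst(y, x) = src(mapy(y, x), mapx(y, x)), with interpolation
// and border handling supplied by the Ptr2D source wrapper (a filter over a border
// reader, or one of the texture readers defined below).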
template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = mapx.ptr(y)[x];
const float ycoo = mapy.ptr(y)[x];
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
{
static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, bool)
{
(void)srcWhole;
(void)xoff;
(void)yoff;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_remap_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff, yoff; \
tex_remap_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_remap_ ## type , x + xoff, y + yoff); \
} \
}; \
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float* borderValue, bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float*, bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
{
if (stream == 0)
RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc20);
else
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc20);
}
};
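// Dispatch table for remap_gpu: rows select the filter (point, linear, cubic) and
// columns the border mode (reflect101, replicate, constant, reflect, wrap), in the
// index order the host-side wrapper passes as 'interpolation' and 'borderMode'.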
template <typename T> void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
static const caller_t callers[3][5] =
{
{
RemapDispatcher<PointFilter, BrdReflect101, T>::call,
RemapDispatcher<PointFilter, BrdReplicate, T>::call,
RemapDispatcher<PointFilter, BrdConstant, T>::call,
RemapDispatcher<PointFilter, BrdReflect, T>::call,
RemapDispatcher<PointFilter, BrdWrap, T>::call
},
{
RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
RemapDispatcher<LinearFilter, BrdConstant, T>::call,
RemapDispatcher<LinearFilter, BrdReflect, T>::call,
RemapDispatcher<LinearFilter, BrdWrap, T>::call
},
{
RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
RemapDispatcher<CubicFilter, BrdConstant, T>::call,
RemapDispatcher<CubicFilter, BrdReflect, T>::call,
RemapDispatcher<CubicFilter, BrdWrap, T>::call
}
};
callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
}
template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */


@@ -0,0 +1,302 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <cfloat>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp"
#include "opencv2/core/cuda/scan.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = x * fx;
const float ycoo = y * fy;
dst(y, x) = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <typename Ptr2D, typename T> __global__ void resize_area(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
dst(y, x) = saturate_cast<T>(src(y, x));
}
}
template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc, fx, fy);
resize<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <typename T> struct ResizeDispatcherStream<AreaFilter, T>
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
AreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T> struct ResizeDispatcherStream<IntegerAreaFilter, T>
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
IntegerAreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst)
{
(void)srcWhole;
(void)xoff;
(void)yoff;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
resize<<<grid, block>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_resize_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
const int xoff; \
const int yoff; \
__host__ tex_resize_ ## type ## _reader(int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_resize_ ## type, x + xoff, y + yoff); \
} \
}; \
template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type > \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz< type > dst) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_resize_ ## type, srcWhole); \
tex_resize_ ## type ## _reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter<tex_resize_ ## type ## _reader> filteredSrc(texSrc); \
resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
} \
else \
{ \
BrdReplicate< type > brd(src.rows, src.cols); \
BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > brdSrc(texSrc, brd); \
Filter< BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > > filteredSrc(brdSrc); \
resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
template <template <typename> class Filter, typename T> struct ResizeDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
if (stream == 0)
ResizeDispatcherNonStream<Filter, T>::call(src, srcWhole, xoff, yoff, fx, fy, dst);
else
ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
}
};
template <typename T> struct ResizeDispatcher<AreaFilter, T>
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
(void)srcWhole;
(void)xoff;
(void)yoff;
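// Fast path for integer scale factors: the FLT_MIN tolerance effectively requires fx
// and fy to be exact integers, in which case the IntegerAreaFilter is used; otherwise
// fall back to the generic AreaFilter.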
int iscale_x = (int)round(fx);
int iscale_y = (int)round(fy);
if( std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN)
ResizeDispatcherStream<IntegerAreaFilter, T>::call(src, fx, fy, dst, stream);
else
ResizeDispatcherStream<AreaFilter, T>::call(src, fx, fy, dst, stream);
}
};
template <typename T> void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
PtrStepSzb dst, int interpolation, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream);
static const caller_t callers[4] =
{
ResizeDispatcher<PointFilter, T>::call,
ResizeDispatcher<LinearFilter, T>::call,
ResizeDispatcher<CubicFilter, T>::call,
ResizeDispatcher<AreaFilter, T>::call
};
// area interpolation is only valid for downscaling; switch to linear when upscaling
if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
interpolation = 1;
callers[interpolation](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, fx, fy,
static_cast< PtrStepSz<T> >(dst), stream);
}
template void resize_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template<typename T> struct scan_traits{};
template<> struct scan_traits<uchar>
{
typedef float scan_line_type;
};
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */


@@ -0,0 +1,175 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace video_encoding
{
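// Fixed-point RGB -> YUV helpers using the classic 30/59/11 luma weights scaled by
// 100, with a +128 chroma offset folded in as 12800/100.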
__device__ __forceinline__ void rgbtoy(const uchar b, const uchar g, const uchar r, uchar& y)
{
y = static_cast<uchar>(((int)(30 * r) + (int)(59 * g) + (int)(11 * b)) / 100);
}
__device__ __forceinline__ void rgbtoyuv(const uchar b, const uchar g, const uchar r, uchar& y, uchar& u, uchar& v)
{
rgbtoy(b, g, r, y);
u = static_cast<uchar>(((int)(-17 * r) - (int)(33 * g) + (int)(50 * b) + 12800) / 100);
v = static_cast<uchar>(((int)(50 * r) - (int)(42 * g) - (int)(8 * b) + 12800) / 100);
}
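// YV12 output layout assumed by both kernels below: a full-resolution luma plane at
// dst.data, followed by two quarter-size chroma planes stored with half the stride;
// every 2x2 block of input pixels yields four Y samples and one chroma pair.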
__global__ void Gray_to_YV12(const PtrStepSzb src, PtrStepb dst)
{
const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
if (x + 1 >= src.cols || y + 1 >= src.rows)
return;
// get pointers to the data
const size_t planeSize = src.rows * dst.step;
PtrStepb y_plane(dst.data, dst.step);
PtrStepb u_plane(y_plane.data + planeSize, dst.step / 2);
PtrStepb v_plane(u_plane.data + (planeSize / 4), dst.step / 2);
uchar pix;
uchar y_val, u_val, v_val;
pix = src(y, x);
rgbtoy(pix, pix, pix, y_val);
y_plane(y, x) = y_val;
pix = src(y, x + 1);
rgbtoy(pix, pix, pix, y_val);
y_plane(y, x + 1) = y_val;
pix = src(y + 1, x);
rgbtoy(pix, pix, pix, y_val);
y_plane(y + 1, x) = y_val;
pix = src(y + 1, x + 1);
rgbtoyuv(pix, pix, pix, y_val, u_val, v_val);
y_plane(y + 1, x + 1) = y_val;
u_plane(y / 2, x / 2) = u_val;
v_plane(y / 2, x / 2) = v_val;
}
template <typename T>
__global__ void BGR_to_YV12(const PtrStepSz<T> src, PtrStepb dst)
{
const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
if (x + 1 >= src.cols || y + 1 >= src.rows)
return;
// get pointers to the data
const size_t planeSize = src.rows * dst.step;
PtrStepb y_plane(dst.data, dst.step);
PtrStepb u_plane(y_plane.data + planeSize, dst.step / 2);
PtrStepb v_plane(u_plane.data + (planeSize / 4), dst.step / 2);
T pix;
uchar y_val, u_val, v_val;
pix = src(y, x);
rgbtoy(pix.z, pix.y, pix.x, y_val);
y_plane(y, x) = y_val;
pix = src(y, x + 1);
rgbtoy(pix.z, pix.y, pix.x, y_val);
y_plane(y, x + 1) = y_val;
pix = src(y + 1, x);
rgbtoy(pix.z, pix.y, pix.x, y_val);
y_plane(y + 1, x) = y_val;
pix = src(y + 1, x + 1);
rgbtoyuv(pix.z, pix.y, pix.x, y_val, u_val, v_val);
y_plane(y + 1, x + 1) = y_val;
u_plane(y / 2, x / 2) = u_val;
v_plane(y / 2, x / 2) = v_val;
}
void Gray_to_YV12_caller(const PtrStepSzb src, PtrStepb dst)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2));
Gray_to_YV12<<<grid, block>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int cn>
void BGR_to_YV12_caller(const PtrStepSzb src, PtrStepb dst)
{
typedef typename TypeVec<uchar, cn>::vec_type src_t;
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2));
BGR_to_YV12<<<grid, block>>>(static_cast< PtrStepSz<src_t> >(src), dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
void YV12_gpu(const PtrStepSzb src, int cn, PtrStepSzb dst)
{
typedef void (*func_t)(const PtrStepSzb src, PtrStepb dst);
static const func_t funcs[] =
{
0, Gray_to_YV12_caller, 0, BGR_to_YV12_caller<3>, BGR_to_YV12_caller<4>
};
funcs[cn](src, dst);
}
}
}}}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "row_filter.h"
namespace filter
{
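// Each of these small translation units explicitly instantiates linearRow for one
// source/destination type pair (the implementation lives in row_filter.h); splitting
// the instantiations across files is presumably meant to keep per-file nvcc compile
// time and memory usage down.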
template void linearRow<uchar, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "row_filter.h"
namespace filter
{
template void linearRow<uchar3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "row_filter.h"
namespace filter
{
template void linearRow<unsigned short, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "row_filter.h"
namespace filter
{
template void linearRow<ushort3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "row_filter.h"
namespace filter
{
template void linearRow<ushort4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "row_filter.h"
namespace filter
{
template void linearRow<int3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */


@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "row_filter.h"
namespace filter
{
    // Explicit instantiation of the row filter for int4 input / float4 output.
    template void linearRow<int4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "row_filter.h"
namespace filter
{
    // Explicit instantiation of the row filter for uchar4 input / float4 output.
    template void linearRow<uchar4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,52 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "row_filter.h"
namespace filter
{
    // Explicit instantiation of the row filter for short3 input / float3 output.
    template void linearRow<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
}
#endif /* CUDA_DISABLER */
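Each of the files above is a tiny translation unit whose only job is to explicitly instantiate linearRow for one source/destination type pair (int3/float3, int4/float4, uchar4/float4, short3/float3); spreading the instantiations across many small .cu files keeps per-file nvcc compile time and memory in check and lets the build process them in parallel. As a rough illustration of what such a row filter typically computes, the following is a minimal sketch, not the OpenCV implementation: the kernel name rowFilterKernel, the flat single-channel float buffers, the clamp-to-edge border handling, the MAX_KSIZE limit, and the launch configuration are all assumptions made purely for illustration.

/*
 * Hypothetical sketch of a single-channel row convolution, loosely modelled on
 * the linearRow<T, D> signature instantiated above.  NOT the OpenCV kernel.
 */
#include <cuda_runtime.h>

#define MAX_KSIZE 32

__constant__ float c_kernel[MAX_KSIZE];   // filter taps, uploaded from the host

__global__ void rowFilterKernel(const float* src, float* dst,
                                int rows, int cols, size_t pitch,
                                int ksize, int anchor)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= cols || y >= rows)
        return;

    // pitch is given in bytes, so step through the rows as raw bytes
    const float* srcRow = (const float*)((const char*)src + y * pitch);
    float sum = 0.0f;

    for (int k = 0; k < ksize; ++k)
    {
        int xk = x + k - anchor;
        xk = ::max(0, ::min(xk, cols - 1));   // clamp-to-edge border (assumption)
        sum += srcRow[xk] * c_kernel[k];
    }

    float* dstRow = (float*)((char*)dst + y * pitch);
    dstRow[x] = sum;
}

// Host-side launcher: copies the taps to constant memory and runs the kernel.
void rowFilter(const float* d_src, float* d_dst, int rows, int cols, size_t pitch,
               const float* h_kernel, int ksize, int anchor, cudaStream_t stream)
{
    cudaMemcpyToSymbolAsync(c_kernel, h_kernel, ksize * sizeof(float),
                            0, cudaMemcpyHostToDevice, stream);

    const dim3 block(32, 8);
    const dim3 grid((cols + block.x - 1) / block.x,
                    (rows + block.y - 1) / block.y);

    rowFilterKernel<<<grid, block, 0, stream>>>(d_src, d_dst, rows, cols,
                                                pitch, ksize, anchor);
}

A caller would allocate pitched device buffers (for example with cudaMallocPitch), fill d_src, and invoke rowFilter with the desired taps. The real module instead dispatches on the brd_type (border mode) and cc (presumably compute capability) arguments visible in the instantiations above, and supports the vector pixel types listed there rather than a single flat float channel.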

Some files were not shown because too many files have changed in this diff.