Normalize line endings and whitespace

OpenCV Buildbot
2012-10-17 11:12:04 +04:00
committed by Andrey Kamaev
parent 0442bca235
commit 81f826db2b
1511 changed files with 258678 additions and 258624 deletions


@@ -44,7 +44,7 @@ if (HAVE_CUDA)
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler /wd4251)
endif()
endif()
ocv_cuda_compile(cuda_objs ${lib_cuda} ${ncv_cuda})
#CUDA_BUILD_CLEAN_TARGET()

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -1,221 +1,221 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_COLOR_HPP__
#define __OPENCV_GPU_COLOR_HPP__
#include "detail/color_detail.hpp"
namespace cv { namespace gpu { namespace device
{
// All OPENCV_GPU_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implement
// template <typename T> class ColorSpace1_to_ColorSpace2_traits
// {
// typedef ... functor_type;
// static __host__ __device__ functor_type create_functor();
// };
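// Editor's note (illustrative sketch, not part of the original header): for example,
// OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2) below generates
// bgr_to_rgb_traits, whose functor converts one packed pixel in device code:
//
//   typedef bgr_to_rgb_traits<uchar>::functor_type Cvt;
//   Cvt cvt = bgr_to_rgb_traits<uchar>::create_functor();
//   uchar3 rgb = cvt(make_uchar3(b, g, r)); // channels 0 and 2 swapped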
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)
#undef OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)
#undef OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)
#undef OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS
OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3)
OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)
#undef OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS
OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5)
OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)
#undef OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS
OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5)
OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)
#undef OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 2)
#undef OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 0)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 2)
#undef OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)
#undef OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_COLOR_HPP__


@@ -1,114 +1,114 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_COMMON_HPP__
#define __OPENCV_GPU_COMMON_HPP__
#include <cuda_runtime.h>
#include "opencv2/core/cuda_devptrs.hpp"
#ifndef CV_PI
#define CV_PI 3.1415926535897932384626433832795
#endif
#ifndef CV_PI_F
#ifndef CV_PI
#define CV_PI_F 3.14159265f
#else
#define CV_PI_F ((float)CV_PI)
#endif
#endif
#if defined(__GNUC__)
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
#endif
namespace cv { namespace gpu
{
void error(const char *error_string, const char *file, const int line, const char *func);
template <typename T> static inline bool isAligned(const T* ptr, size_t size)
{
return reinterpret_cast<size_t>(ptr) % size == 0;
}
static inline bool isAligned(size_t step, size_t size)
{
return step % size == 0;
}
}}
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
if (cudaSuccess != err)
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
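// Editor's note (illustrative usage, not part of the original header): any CUDA
// runtime call returning cudaError_t can be wrapped so a failure is reported through
// cv::gpu::error together with file, line and (under GCC) function name:
//
//   cudaSafeCall( cudaGetLastError() );
//   cudaSafeCall( cudaDeviceSynchronize() );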
#ifdef __CUDACC__
namespace cv { namespace gpu
{
__host__ __device__ __forceinline__ int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
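// Editor's note (illustrative, not part of the original header): divUp rounds an
// integer division upwards, which is how launch grids are sized from image
// dimensions, e.g. for a 1000 x 480 image and 32 x 8 thread blocks:
//
//   dim3 block(32, 8);
//   dim3 grid(divUp(1000, block.x), divUp(480, block.y)); // 32 x 60 blocks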
namespace device
{
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef signed char schar;
typedef unsigned int uint;
template<class T> inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
}
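// Editor's note (illustrative sketch, not part of the original header): a .cu file
// would declare a texture reference at file scope and bind a device image to it with
// the helper above; tex_src and src below are made-up names:
//
//   texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src;
//   ...
//   bindTexture(&tex_src, src); // src is a PtrStepSz<uchar> pointing to device memory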
}
}}
#endif // __CUDACC__
#endif // __OPENCV_GPU_COMMON_HPP__


@@ -1,105 +1,105 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_DATAMOV_UTILS_HPP__
#define __OPENCV_GPU_DATAMOV_UTILS_HPP__
#include "common.hpp"
namespace cv { namespace gpu { namespace device
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
// for Fermi, memory space is detected automatically
template <typename T> struct ForceGlob
{
__device__ __forceinline__ static void Load(const T* ptr, int offset, T& val) { val = ptr[offset]; }
};
#else // __CUDA_ARCH__ >= 200
#if defined(_WIN64) || defined(__LP64__)
// 64-bit register modifier for inlined asm
#define OPENCV_GPU_ASM_PTR "l"
#else
// 32-bit register modifier for inlined asm
#define OPENCV_GPU_ASM_PTR "r"
#endif
template<class T> struct ForceGlob;
#define OPENCV_GPU_DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod) \
template <> struct ForceGlob<base_type> \
{ \
__device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
{ \
asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : OPENCV_GPU_ASM_PTR(ptr + offset)); \
} \
};
#define OPENCV_GPU_DEFINE_FORCE_GLOB_B(base_type, ptx_type) \
template <> struct ForceGlob<base_type> \
{ \
__device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
{ \
asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : OPENCV_GPU_ASM_PTR(ptr + offset)); \
} \
};
OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar, u8)
OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar, s8)
OPENCV_GPU_DEFINE_FORCE_GLOB_B(char, b8)
OPENCV_GPU_DEFINE_FORCE_GLOB (ushort, u16, h)
OPENCV_GPU_DEFINE_FORCE_GLOB (short, s16, h)
OPENCV_GPU_DEFINE_FORCE_GLOB (uint, u32, r)
OPENCV_GPU_DEFINE_FORCE_GLOB (int, s32, r)
OPENCV_GPU_DEFINE_FORCE_GLOB (float, f32, f)
OPENCV_GPU_DEFINE_FORCE_GLOB (double, f64, d)
#undef OPENCV_GPU_DEFINE_FORCE_GLOB
#undef OPENCV_GPU_DEFINE_FORCE_GLOB_B
#undef OPENCV_GPU_ASM_PTR
#endif // __CUDA_ARCH__ >= 200
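// Editor's note (illustrative, not part of the original header): both branches above
// expose the same interface, so device code can force a load from global memory in an
// architecture-independent way:
//
//   float val;
//   ForceGlob<float>::Load(row_ptr, x, val); // val = row_ptr[x], issued as ld.global on sm_1x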
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_DATAMOV_UTILS_HPP__

File diff suppressed because it is too large


@@ -1,395 +1,395 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
#define __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
#include "../common.hpp"
#include "../vec_traits.hpp"
#include "../functional.hpp"
namespace cv { namespace gpu { namespace device
{
namespace transform_detail
{
//! Read Write Traits
template <typename T, typename D, int shift> struct UnaryReadWriteTraits
{
typedef typename TypeVec<T, shift>::vec_type read_type;
typedef typename TypeVec<D, shift>::vec_type write_type;
};
template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
{
typedef typename TypeVec<T1, shift>::vec_type read_type1;
typedef typename TypeVec<T2, shift>::vec_type read_type2;
typedef typename TypeVec<D, shift>::vec_type write_type;
};
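// Editor's note (illustrative, not part of the original header): these traits turn
// per-element accesses into vectorized ones. Assuming the usual TypeVec mapping,
// T = uchar with shift = 4 gives
//
//   typedef UnaryReadWriteTraits<uchar, uchar, 4>::read_type read4; // uchar4
//
// so the unrolled kernels below read and write four consecutive pixels per transaction.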
//! Transform kernels
template <int shift> struct OpUnroller;
template <> struct OpUnroller<1>
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src.x);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src1.x, src2.x);
}
};
template <> struct OpUnroller<2>
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src.x);
if (mask(y, x_shifted + 1))
dst.y = op(src.y);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src1.x, src2.x);
if (mask(y, x_shifted + 1))
dst.y = op(src1.y, src2.y);
}
};
template <> struct OpUnroller<3>
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src.x);
if (mask(y, x_shifted + 1))
dst.y = op(src.y);
if (mask(y, x_shifted + 2))
dst.z = op(src.z);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src1.x, src2.x);
if (mask(y, x_shifted + 1))
dst.y = op(src1.y, src2.y);
if (mask(y, x_shifted + 2))
dst.z = op(src1.z, src2.z);
}
};
template <> struct OpUnroller<4>
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src.x);
if (mask(y, x_shifted + 1))
dst.y = op(src.y);
if (mask(y, x_shifted + 2))
dst.z = op(src.z);
if (mask(y, x_shifted + 3))
dst.w = op(src.w);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src1.x, src2.x);
if (mask(y, x_shifted + 1))
dst.y = op(src1.y, src2.y);
if (mask(y, x_shifted + 2))
dst.z = op(src1.z, src2.z);
if (mask(y, x_shifted + 3))
dst.w = op(src1.w, src2.w);
}
};
template <> struct OpUnroller<8>
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.a0 = op(src.a0);
if (mask(y, x_shifted + 1))
dst.a1 = op(src.a1);
if (mask(y, x_shifted + 2))
dst.a2 = op(src.a2);
if (mask(y, x_shifted + 3))
dst.a3 = op(src.a3);
if (mask(y, x_shifted + 4))
dst.a4 = op(src.a4);
if (mask(y, x_shifted + 5))
dst.a5 = op(src.a5);
if (mask(y, x_shifted + 6))
dst.a6 = op(src.a6);
if (mask(y, x_shifted + 7))
dst.a7 = op(src.a7);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.a0 = op(src1.a0, src2.a0);
if (mask(y, x_shifted + 1))
dst.a1 = op(src1.a1, src2.a1);
if (mask(y, x_shifted + 2))
dst.a2 = op(src1.a2, src2.a2);
if (mask(y, x_shifted + 3))
dst.a3 = op(src1.a3, src2.a3);
if (mask(y, x_shifted + 4))
dst.a4 = op(src1.a4, src2.a4);
if (mask(y, x_shifted + 5))
dst.a5 = op(src1.a5, src2.a5);
if (mask(y, x_shifted + 6))
dst.a6 = op(src1.a6, src2.a6);
if (mask(y, x_shifted + 7))
dst.a7 = op(src1.a7, src2.a7);
}
};
template <typename T, typename D, typename UnOp, typename Mask>
static __global__ void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
{
typedef TransformFunctorTraits<UnOp> ft;
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;
const int x = threadIdx.x + blockIdx.x * blockDim.x;
const int y = threadIdx.y + blockIdx.y * blockDim.y;
const int x_shifted = x * ft::smart_shift;
if (y < src_.rows)
{
const T* src = src_.ptr(y);
D* dst = dst_.ptr(y);
if (x_shifted + ft::smart_shift - 1 < src_.cols)
{
const read_type src_n_el = ((const read_type*)src)[x];
write_type dst_n_el = ((const write_type*)dst)[x];
OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
((write_type*)dst)[x] = dst_n_el;
}
else
{
for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
{
if (mask(y, real_x))
dst[real_x] = op(src[real_x]);
}
}
}
}
template <typename T, typename D, typename UnOp, typename Mask>
__global__ static void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < src.cols && y < src.rows && mask(y, x))
{
dst.ptr(y)[x] = op(src.ptr(y)[x]);
}
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __global__ void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
const Mask mask, const BinOp op)
{
typedef TransformFunctorTraits<BinOp> ft;
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;
const int x = threadIdx.x + blockIdx.x * blockDim.x;
const int y = threadIdx.y + blockIdx.y * blockDim.y;
const int x_shifted = x * ft::smart_shift;
if (y < src1_.rows)
{
const T1* src1 = src1_.ptr(y);
const T2* src2 = src2_.ptr(y);
D* dst = dst_.ptr(y);
if (x_shifted + ft::smart_shift - 1 < src1_.cols)
{
const read_type1 src1_n_el = ((const read_type1*)src1)[x];
const read_type2 src2_n_el = ((const read_type2*)src2)[x];
write_type dst_n_el = ((const write_type*)dst)[x];
OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
((write_type*)dst)[x] = dst_n_el;
}
else
{
for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
{
if (mask(y, real_x))
dst[real_x] = op(src1[real_x], src2[real_x]);
}
}
}
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
const Mask mask, const BinOp op)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < src1.cols && y < src1.rows && mask(y, x))
{
const T1 src1_data = src1.ptr(y)[x];
const T2 src2_data = src2.ptr(y)[x];
dst.ptr(y)[x] = op(src1_data, src2_data);
}
}
template <bool UseSmart> struct TransformDispatcher;
template<> struct TransformDispatcher<false>
{
template <typename T, typename D, typename UnOp, typename Mask>
static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
{
typedef TransformFunctorTraits<UnOp> ft;
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);
transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
{
typedef TransformFunctorTraits<BinOp> ft;
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);
transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<> struct TransformDispatcher<true>
{
template <typename T, typename D, typename UnOp, typename Mask>
static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
{
typedef TransformFunctorTraits<UnOp> ft;
StaticAssert<ft::smart_shift != 1>::check();
if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) ||
!isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
{
TransformDispatcher<false>::call(src, dst, op, mask, stream);
return;
}
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);
transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
{
typedef TransformFunctorTraits<BinOp> ft;
StaticAssert<ft::smart_shift != 1>::check();
if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src1.step, ft::smart_shift * sizeof(T1)) ||
!isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(src2.step, ft::smart_shift * sizeof(T2)) ||
!isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
{
TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
return;
}
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);
transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
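// Editor's note (illustrative sketch, not part of the original header; MyUnaryOp and
// MyMask are made-up names): a caller with an element-wise functor and a mask functor
// exposing operator()(int y, int x) would launch the vectorized path as
//
//   TransformDispatcher<true>::call(src, dst, MyUnaryOp(), MyMask(), stream);
//
// and the alignment check above quietly falls back to TransformDispatcher<false>
// (one element per thread) whenever src/dst data or step are not multiples of
// ft::smart_shift * sizeof(element).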
} // namespace transform_detail
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
{
typedef TransformFunctorTraits<BinOp> ft;
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);
transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<> struct TransformDispatcher<true>
{
template <typename T, typename D, typename UnOp, typename Mask>
static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
{
typedef TransformFunctorTraits<UnOp> ft;
StaticAssert<ft::smart_shift != 1>::check();
if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) ||
!isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
{
TransformDispatcher<false>::call(src, dst, op, mask, stream);
return;
}
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);
transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
{
typedef TransformFunctorTraits<BinOp> ft;
StaticAssert<ft::smart_shift != 1>::check();
if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src1.step, ft::smart_shift * sizeof(T1)) ||
!isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(src2.step, ft::smart_shift * sizeof(T2)) ||
!isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
{
TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
return;
}
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);
transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
} // namespace transform_detail
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
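The dispatcher above picks between two kernels: transformSmart reads and writes ft::smart_shift elements per thread through vectorized loads, while transformSimple handles one element per thread, and the smart path is taken only when the data pointers and strides are aligned to the vector width. A minimal standalone sketch of the same idea, assuming a hypothetical uchar negation op outside OpenCV's traits machinery (names are illustrative only):

// Sketch only: mirrors the aligned/vectorized vs. element-wise dispatch above.
__global__ void negate_simple(const unsigned char* src, unsigned char* dst, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = (unsigned char)(255 - src[i]);
}
__global__ void negate_vec4(const uchar4* src, uchar4* dst, int n4)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n4)
    {
        const uchar4 v = src[i]; // one 4-byte load instead of four 1-byte loads
        dst[i] = make_uchar4(255 - v.x, 255 - v.y, 255 - v.z, 255 - v.w);
    }
}
void negate(const unsigned char* src, unsigned char* dst, int n, cudaStream_t stream)
{
    const int threads = 256;
    const bool aligned = ((size_t)src % sizeof(uchar4) == 0) && ((size_t)dst % sizeof(uchar4) == 0) && (n % 4 == 0);
    if (aligned)
        negate_vec4<<<(n / 4 + threads - 1) / threads, threads, 0, stream>>>((const uchar4*)src, (uchar4*)dst, n / 4);
    else
        negate_simple<<<(n + threads - 1) / threads, threads, 0, stream>>>(src, dst, n);
}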

View File

@@ -1,187 +1,187 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
#define __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
#include "../common.hpp"
#include "../vec_traits.hpp"
namespace cv { namespace gpu { namespace device
{
namespace type_traits_detail
{
template <bool, typename T1, typename T2> struct Select { typedef T1 type; };
template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; };
template <typename T> struct IsSignedIntergral { enum {value = 0}; };
template <> struct IsSignedIntergral<schar> { enum {value = 1}; };
template <> struct IsSignedIntergral<char1> { enum {value = 1}; };
template <> struct IsSignedIntergral<short> { enum {value = 1}; };
template <> struct IsSignedIntergral<short1> { enum {value = 1}; };
template <> struct IsSignedIntergral<int> { enum {value = 1}; };
template <> struct IsSignedIntergral<int1> { enum {value = 1}; };
template <typename T> struct IsUnsignedIntegral { enum {value = 0}; };
template <> struct IsUnsignedIntegral<uchar> { enum {value = 1}; };
template <> struct IsUnsignedIntegral<uchar1> { enum {value = 1}; };
template <> struct IsUnsignedIntegral<ushort> { enum {value = 1}; };
template <> struct IsUnsignedIntegral<ushort1> { enum {value = 1}; };
template <> struct IsUnsignedIntegral<uint> { enum {value = 1}; };
template <> struct IsUnsignedIntegral<uint1> { enum {value = 1}; };
template <typename T> struct IsIntegral { enum {value = IsSignedIntergral<T>::value || IsUnsignedIntegral<T>::value}; };
template <> struct IsIntegral<char> { enum {value = 1}; };
template <> struct IsIntegral<bool> { enum {value = 1}; };
template <typename T> struct IsFloat { enum {value = 0}; };
template <> struct IsFloat<float> { enum {value = 1}; };
template <> struct IsFloat<double> { enum {value = 1}; };
template <typename T> struct IsVec { enum {value = 0}; };
template <> struct IsVec<uchar1> { enum {value = 1}; };
template <> struct IsVec<uchar2> { enum {value = 1}; };
template <> struct IsVec<uchar3> { enum {value = 1}; };
template <> struct IsVec<uchar4> { enum {value = 1}; };
template <> struct IsVec<uchar8> { enum {value = 1}; };
template <> struct IsVec<char1> { enum {value = 1}; };
template <> struct IsVec<char2> { enum {value = 1}; };
template <> struct IsVec<char3> { enum {value = 1}; };
template <> struct IsVec<char4> { enum {value = 1}; };
template <> struct IsVec<char8> { enum {value = 1}; };
template <> struct IsVec<ushort1> { enum {value = 1}; };
template <> struct IsVec<ushort2> { enum {value = 1}; };
template <> struct IsVec<ushort3> { enum {value = 1}; };
template <> struct IsVec<ushort4> { enum {value = 1}; };
template <> struct IsVec<ushort8> { enum {value = 1}; };
template <> struct IsVec<short1> { enum {value = 1}; };
template <> struct IsVec<short2> { enum {value = 1}; };
template <> struct IsVec<short3> { enum {value = 1}; };
template <> struct IsVec<short4> { enum {value = 1}; };
template <> struct IsVec<short8> { enum {value = 1}; };
template <> struct IsVec<uint1> { enum {value = 1}; };
template <> struct IsVec<uint2> { enum {value = 1}; };
template <> struct IsVec<uint3> { enum {value = 1}; };
template <> struct IsVec<uint4> { enum {value = 1}; };
template <> struct IsVec<uint8> { enum {value = 1}; };
template <> struct IsVec<int1> { enum {value = 1}; };
template <> struct IsVec<int2> { enum {value = 1}; };
template <> struct IsVec<int3> { enum {value = 1}; };
template <> struct IsVec<int4> { enum {value = 1}; };
template <> struct IsVec<int8> { enum {value = 1}; };
template <> struct IsVec<float1> { enum {value = 1}; };
template <> struct IsVec<float2> { enum {value = 1}; };
template <> struct IsVec<float3> { enum {value = 1}; };
template <> struct IsVec<float4> { enum {value = 1}; };
template <> struct IsVec<float8> { enum {value = 1}; };
template <> struct IsVec<double1> { enum {value = 1}; };
template <> struct IsVec<double2> { enum {value = 1}; };
template <> struct IsVec<double3> { enum {value = 1}; };
template <> struct IsVec<double4> { enum {value = 1}; };
template <> struct IsVec<double8> { enum {value = 1}; };
template <class U> struct AddParameterType { typedef const U& type; };
template <class U> struct AddParameterType<U&> { typedef U& type; };
template <> struct AddParameterType<void> { typedef void type; };
template <class U> struct ReferenceTraits
{
enum { value = false };
typedef U type;
};
template <class U> struct ReferenceTraits<U&>
{
enum { value = true };
typedef U type;
};
template <class U> struct PointerTraits
{
enum { value = false };
typedef void type;
};
template <class U> struct PointerTraits<U*>
{
enum { value = true };
typedef U type;
};
template <class U> struct PointerTraits<U*&>
{
enum { value = true };
typedef U type;
};
template <class U> struct UnConst
{
typedef U type;
enum { value = 0 };
};
template <class U> struct UnConst<const U>
{
typedef U type;
enum { value = 1 };
};
template <class U> struct UnConst<const U&>
{
typedef U& type;
enum { value = 1 };
};
template <class U> struct UnVolatile
{
typedef U type;
enum { value = 0 };
};
template <class U> struct UnVolatile<volatile U>
{
typedef U type;
enum { value = 1 };
};
template <class U> struct UnVolatile<volatile U&>
{
typedef U& type;
enum { value = 1 };
};
} // namespace type_traits_detail
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
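Select and the Is* predicates are pre-C++11 stand-ins for std::conditional and std::is_integral, evaluated entirely at compile time. A small usage sketch, assuming a hypothetical Accumulator trait built on top of them:

// Pick an accumulator type at compile time: integral element types accumulate
// in int, floating-point element types accumulate in float.
template <typename T> struct Accumulator
{
    typedef typename cv::gpu::device::type_traits_detail::Select<
        cv::gpu::device::type_traits_detail::IsIntegral<T>::value, int, float>::type type;
};
// Accumulator<unsigned char>::type is int; Accumulator<float>::type is float.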

View File

@@ -1,117 +1,117 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
#include "../datamov_utils.hpp"
namespace cv { namespace gpu { namespace device
{
namespace vec_distance_detail
{
template <int THREAD_DIM, int N> struct UnrollVecDiffCached
{
template <typename Dist, typename T1, typename T2>
static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
{
if (ind < len)
{
T1 val1 = *vecCached++;
T2 val2;
ForceGlob<T2>::Load(vecGlob, ind, val2);
dist.reduceIter(val1, val2);
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
}
}
template <typename Dist, typename T1, typename T2>
static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
{
T1 val1 = *vecCached++;
T2 val2;
ForceGlob<T2>::Load(vecGlob, 0, val2);
vecGlob += THREAD_DIM;
dist.reduceIter(val1, val2);
UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);
}
};
template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
{
}
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
{
}
};
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);
}
};
template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>
{
template <typename Dist, typename T1, typename T2>
static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
{
UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
}
};
} // namespace vec_distance_detail
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
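UnrollVecDiffCached peels a strided reduction loop at compile time: each recursion level consumes one cached element and one global element THREAD_DIM positions further on, and the <THREAD_DIM, 0> specialization terminates the recursion. The same pattern, stripped of the Dist functor and CUDA qualifiers, looks like this (plain C++ sketch, names illustrative):

// Sums v[ind], v[ind + STRIDE], ... for at most N terms, unrolled at compile time.
template <int STRIDE, int N> struct UnrollSum
{
    static float run(const float* v, int len, int ind)
    {
        if (ind >= len)
            return 0.f;
        return v[ind] + UnrollSum<STRIDE, N - 1>::run(v, len, ind + STRIDE);
    }
};
template <int STRIDE> struct UnrollSum<STRIDE, 0>
{
    static float run(const float*, int, int) { return 0.f; }
};
// UnrollSum<32, 4>::run(data, len, tid) corresponds to calcCheck with THREAD_DIM = 32 and N = 4.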

View File

@@ -1,80 +1,80 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_DYNAMIC_SMEM_HPP__
#define __OPENCV_GPU_DYNAMIC_SMEM_HPP__
namespace cv { namespace gpu { namespace device
{
template<class T> struct DynamicSharedMem
{
__device__ __forceinline__ operator T*()
{
extern __shared__ int __smem[];
return (T*)__smem;
}
__device__ __forceinline__ operator const T*() const
{
extern __shared__ int __smem[];
return (T*)__smem;
}
};
// specialize for double to avoid unaligned memory access compile errors
template<> struct DynamicSharedMem<double>
{
__device__ __forceinline__ operator double*()
{
extern __shared__ double __smem_d[];
return (double*)__smem_d;
}
__device__ __forceinline__ operator const double*() const
{
extern __shared__ double __smem_d[];
return (double*)__smem_d;
}
};
}}}
#endif // __OPENCV_GPU_DYNAMIC_SMEM_HPP__
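DynamicSharedMem exists because every extern __shared__ declaration in a kernel refers to the same dynamic allocation, so templated kernels need a single shared symbol reinterpreted per element type (with a separately named double symbol to keep alignment, as the specialization above notes). A minimal usage sketch, assuming a hypothetical kernel launched with n * sizeof(float) bytes of dynamic shared memory:

__global__ void reverse_block(const float* in, float* out, int n)
{
    cv::gpu::device::DynamicSharedMem<float> shared;
    float* buf = shared;            // implicit conversion to float* over the dynamic allocation
    const int i = threadIdx.x;
    if (i < n)
        buf[i] = in[i];
    __syncthreads();
    if (i < n)
        out[i] = buf[n - 1 - i];
}
// launch: reverse_block<<<1, n, n * sizeof(float)>>>(d_in, d_out, n);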

View File

@@ -1,278 +1,278 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_FILTERS_HPP__
#define __OPENCV_GPU_FILTERS_HPP__
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
#include "vec_math.hpp"
#include "type_traits.hpp"
namespace cv { namespace gpu { namespace device
{
template <typename Ptr2D> struct PointFilter
{
typedef typename Ptr2D::elem_type elem_type;
typedef float index_type;
explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_, float fx = 0.f, float fy = 0.f)
: src(src_)
{
(void)fx;
(void)fy;
}
__device__ __forceinline__ elem_type operator ()(float y, float x) const
{
return src(__float2int_rz(y), __float2int_rz(x));
}
const Ptr2D src;
};
template <typename Ptr2D> struct LinearFilter
{
typedef typename Ptr2D::elem_type elem_type;
typedef float index_type;
explicit __host__ __device__ __forceinline__ LinearFilter(const Ptr2D& src_, float fx = 0.f, float fy = 0.f)
: src(src_)
{
(void)fx;
(void)fy;
}
__device__ __forceinline__ elem_type operator ()(float y, float x) const
{
typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
work_type out = VecTraits<work_type>::all(0);
const int x1 = __float2int_rd(x);
const int y1 = __float2int_rd(y);
const int x2 = x1 + 1;
const int y2 = y1 + 1;
elem_type src_reg = src(y1, x1);
out = out + src_reg * ((x2 - x) * (y2 - y));
src_reg = src(y1, x2);
out = out + src_reg * ((x - x1) * (y2 - y));
src_reg = src(y2, x1);
out = out + src_reg * ((x2 - x) * (y - y1));
src_reg = src(y2, x2);
out = out + src_reg * ((x - x1) * (y - y1));
return saturate_cast<elem_type>(out);
}
const Ptr2D src;
};
template <typename Ptr2D> struct CubicFilter
{
typedef typename Ptr2D::elem_type elem_type;
typedef float index_type;
typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
explicit __host__ __device__ __forceinline__ CubicFilter(const Ptr2D& src_, float fx = 0.f, float fy = 0.f)
: src(src_)
{
(void)fx;
(void)fy;
}
static __device__ __forceinline__ float bicubicCoeff(float x_)
{
float x = fabsf(x_);
if (x <= 1.0f)
{
return x * x * (1.5f * x - 2.5f) + 1.0f;
}
else if (x < 2.0f)
{
return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
}
else
{
return 0.0f;
}
}
__device__ elem_type operator ()(float y, float x) const
{
const float xmin = ::ceilf(x - 2.0f);
const float xmax = ::floorf(x + 2.0f);
const float ymin = ::ceilf(y - 2.0f);
const float ymax = ::floorf(y + 2.0f);
work_type sum = VecTraits<work_type>::all(0);
float wsum = 0.0f;
for (float cy = ymin; cy <= ymax; cy += 1.0f)
{
for (float cx = xmin; cx <= xmax; cx += 1.0f)
{
const float w = bicubicCoeff(x - cx) * bicubicCoeff(y - cy);
sum = sum + w * src(__float2int_rd(cy), __float2int_rd(cx));
wsum += w;
}
}
work_type res = (!wsum)? VecTraits<work_type>::all(0) : sum / wsum;
return saturate_cast<elem_type>(res);
}
const Ptr2D src;
};
// for integer scaling
template <typename Ptr2D> struct IntegerAreaFilter
{
typedef typename Ptr2D::elem_type elem_type;
typedef float index_type;
explicit __host__ __device__ __forceinline__ IntegerAreaFilter(const Ptr2D& src_, float scale_x_, float scale_y_)
: src(src_), scale_x(scale_x_), scale_y(scale_y_), scale(1.f / (scale_x * scale_y)) {}
__device__ __forceinline__ elem_type operator ()(float y, float x) const
{
float fsx1 = x * scale_x;
float fsx2 = fsx1 + scale_x;
int sx1 = __float2int_ru(fsx1);
int sx2 = __float2int_rd(fsx2);
float fsy1 = y * scale_y;
float fsy2 = fsy1 + scale_y;
int sy1 = __float2int_ru(fsy1);
int sy2 = __float2int_rd(fsy2);
typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
work_type out = VecTraits<work_type>::all(0.f);
for(int dy = sy1; dy < sy2; ++dy)
for(int dx = sx1; dx < sx2; ++dx)
{
out = out + src(dy, dx) * scale;
}
return saturate_cast<elem_type>(out);
}
const Ptr2D src;
        float scale_x, scale_y, scale;
};
template <typename Ptr2D> struct AreaFilter
{
typedef typename Ptr2D::elem_type elem_type;
typedef float index_type;
explicit __host__ __device__ __forceinline__ AreaFilter(const Ptr2D& src_, float scale_x_, float scale_y_)
: src(src_), scale_x(scale_x_), scale_y(scale_y_){}
__device__ __forceinline__ elem_type operator ()(float y, float x) const
{
float fsx1 = x * scale_x;
float fsx2 = fsx1 + scale_x;
int sx1 = __float2int_ru(fsx1);
int sx2 = __float2int_rd(fsx2);
float fsy1 = y * scale_y;
float fsy2 = fsy1 + scale_y;
int sy1 = __float2int_ru(fsy1);
int sy2 = __float2int_rd(fsy2);
float scale = 1.f / (fminf(scale_x, src.width - fsx1) * fminf(scale_y, src.height - fsy1));
typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
work_type out = VecTraits<work_type>::all(0.f);
for (int dy = sy1; dy < sy2; ++dy)
{
for (int dx = sx1; dx < sx2; ++dx)
out = out + src(dy, dx) * scale;
if (sx1 > fsx1)
out = out + src(dy, (sx1 -1) ) * ((sx1 - fsx1) * scale);
if (sx2 < fsx2)
out = out + src(dy, sx2) * ((fsx2 -sx2) * scale);
}
if (sy1 > fsy1)
for (int dx = sx1; dx < sx2; ++dx)
out = out + src( (sy1 - 1) , dx) * ((sy1 -fsy1) * scale);
if (sy2 < fsy2)
for (int dx = sx1; dx < sx2; ++dx)
out = out + src(sy2, dx) * ((fsy2 -sy2) * scale);
if ((sy1 > fsy1) && (sx1 > fsx1))
out = out + src( (sy1 - 1) , (sx1 - 1)) * ((sy1 -fsy1) * (sx1 -fsx1) * scale);
if ((sy1 > fsy1) && (sx2 < fsx2))
out = out + src( (sy1 - 1) , sx2) * ((sy1 -fsy1) * (fsx2 -sx2) * scale);
if ((sy2 < fsy2) && (sx2 < fsx2))
out = out + src(sy2, sx2) * ((fsy2 -sy2) * (fsx2 -sx2) * scale);
if ((sy2 < fsy2) && (sx1 > fsx1))
out = out + src(sy2, (sx1 - 1)) * ((fsy2 -sy2) * (sx1 -fsx1) * scale);
return saturate_cast<elem_type>(out);
}
const Ptr2D src;
float scale_x, scale_y;
        int width, height;
};
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_FILTERS_HPP__
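LinearFilter is plain bilinear interpolation: the four weights (x2 - x)(y2 - y), (x - x1)(y2 - y), (x2 - x)(y - y1) and (x - x1)(y - y1) sum to 1 because x2 - x1 = y2 - y1 = 1. A host-side sketch of the same weighting, assuming a hypothetical row-major float image and ignoring border handling:

#include <math.h>
float bilinear(const float* img, int cols, float y, float x)
{
    const int x1 = (int)floorf(x), y1 = (int)floorf(y);
    const int x2 = x1 + 1,         y2 = y1 + 1;
    return img[y1 * cols + x1] * ((x2 - x) * (y2 - y))
         + img[y1 * cols + x2] * ((x - x1) * (y2 - y))
         + img[y2 * cols + x1] * ((x2 - x) * (y - y1))
         + img[y2 * cols + x2] * ((x - x1) * (y - y1));
}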

View File

@@ -1,72 +1,72 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_
#define __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_
#include <cstdio>
namespace cv { namespace gpu { namespace device
{
template<class Func>
void printFuncAttrib(Func& func)
{
cudaFuncAttributes attrs;
cudaFuncGetAttributes(&attrs, func);
printf("=== Function stats ===\n");
printf("Name: \n");
printf("sharedSizeBytes = %d\n", attrs.sharedSizeBytes);
printf("constSizeBytes = %d\n", attrs.constSizeBytes);
printf("localSizeBytes = %d\n", attrs.localSizeBytes);
printf("maxThreadsPerBlock = %d\n", attrs.maxThreadsPerBlock);
printf("numRegs = %d\n", attrs.numRegs);
printf("ptxVersion = %d\n", attrs.ptxVersion);
printf("binaryVersion = %d\n", attrs.binaryVersion);
printf("\n");
fflush(stdout);
}
}}} // namespace cv { namespace gpu { namespace device
#endif /* __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_ */
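A minimal usage sketch for the helper above, assuming the OpenCV 2.4 gpu-module include layout; the kernel name, wrapper function and include path are illustrative only. printFuncAttrib is instantiated with a __global__ function from host code and dumps that kernel's resource usage to stdout.

#include <opencv2/gpu/device/funcattrib.hpp> // assumed 2.4-era path

__global__ void scaleKernel(const float* src, float* dst, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = 2.0f * src[i];
}

void reportScaleKernelStats()
{
    // Prints shared/constant/local memory sizes, register count and PTX/binary versions.
    cv::gpu::device::printFuncAttrib(scaleKernel);
}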

File diff suppressed because it is too large Load Diff

View File

@@ -1,235 +1,235 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_LIMITS_GPU_HPP__
#define __OPENCV_GPU_LIMITS_GPU_HPP__
#include <limits>
#include "common.hpp"
namespace cv { namespace gpu { namespace device
{
template<class T> struct numeric_limits
{
typedef T type;
__device__ __forceinline__ static type min() { return type(); };
__device__ __forceinline__ static type max() { return type(); };
__device__ __forceinline__ static type epsilon() { return type(); }
__device__ __forceinline__ static type round_error() { return type(); }
__device__ __forceinline__ static type denorm_min() { return type(); }
__device__ __forceinline__ static type infinity() { return type(); }
__device__ __forceinline__ static type quiet_NaN() { return type(); }
__device__ __forceinline__ static type signaling_NaN() { return T(); }
static const bool is_signed;
};
template<> struct numeric_limits<bool>
{
typedef bool type;
__device__ __forceinline__ static type min() { return false; };
__device__ __forceinline__ static type max() { return true; };
__device__ __forceinline__ static type epsilon();
__device__ __forceinline__ static type round_error();
__device__ __forceinline__ static type denorm_min();
__device__ __forceinline__ static type infinity();
__device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = false;
};
template<> struct numeric_limits<char>
{
typedef char type;
__device__ __forceinline__ static type min() { return CHAR_MIN; };
__device__ __forceinline__ static type max() { return CHAR_MAX; };
__device__ __forceinline__ static type epsilon();
__device__ __forceinline__ static type round_error();
__device__ __forceinline__ static type denorm_min();
__device__ __forceinline__ static type infinity();
__device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = (char)-1 == -1;
};
template<> struct numeric_limits<signed char>
{
        typedef signed char type;
__device__ __forceinline__ static type min() { return SCHAR_MIN; };
__device__ __forceinline__ static type max() { return SCHAR_MAX; };
__device__ __forceinline__ static type epsilon();
__device__ __forceinline__ static type round_error();
__device__ __forceinline__ static type denorm_min();
__device__ __forceinline__ static type infinity();
__device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = (signed char)-1 == -1;
};
template<> struct numeric_limits<unsigned char>
{
typedef unsigned char type;
__device__ __forceinline__ static type min() { return 0; };
__device__ __forceinline__ static type max() { return UCHAR_MAX; };
__device__ __forceinline__ static type epsilon();
__device__ __forceinline__ static type round_error();
__device__ __forceinline__ static type denorm_min();
__device__ __forceinline__ static type infinity();
__device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = false;
};
template<> struct numeric_limits<short>
{
typedef short type;
__device__ __forceinline__ static type min() { return SHRT_MIN; };
__device__ __forceinline__ static type max() { return SHRT_MAX; };
__device__ __forceinline__ static type epsilon();
__device__ __forceinline__ static type round_error();
__device__ __forceinline__ static type denorm_min();
__device__ __forceinline__ static type infinity();
__device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = true;
};
template<> struct numeric_limits<unsigned short>
{
typedef unsigned short type;
__device__ __forceinline__ static type min() { return 0; };
__device__ __forceinline__ static type max() { return USHRT_MAX; };
__device__ __forceinline__ static type epsilon();
__device__ __forceinline__ static type round_error();
__device__ __forceinline__ static type denorm_min();
__device__ __forceinline__ static type infinity();
__device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = false;
};
template<> struct numeric_limits<int>
{
typedef int type;
__device__ __forceinline__ static type min() { return INT_MIN; };
__device__ __forceinline__ static type max() { return INT_MAX; };
__device__ __forceinline__ static type epsilon();
__device__ __forceinline__ static type round_error();
__device__ __forceinline__ static type denorm_min();
__device__ __forceinline__ static type infinity();
__device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = true;
};
template<> struct numeric_limits<unsigned int>
{
typedef unsigned int type;
__device__ __forceinline__ static type min() { return 0; };
__device__ __forceinline__ static type max() { return UINT_MAX; };
__device__ __forceinline__ static type epsilon();
__device__ __forceinline__ static type round_error();
__device__ __forceinline__ static type denorm_min();
__device__ __forceinline__ static type infinity();
__device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = false;
};
template<> struct numeric_limits<long>
{
typedef long type;
__device__ __forceinline__ static type min() { return LONG_MIN; };
__device__ __forceinline__ static type max() { return LONG_MAX; };
__device__ __forceinline__ static type epsilon();
__device__ __forceinline__ static type round_error();
__device__ __forceinline__ static type denorm_min();
__device__ __forceinline__ static type infinity();
__device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = true;
};
template<> struct numeric_limits<unsigned long>
{
typedef unsigned long type;
__device__ __forceinline__ static type min() { return 0; };
__device__ __forceinline__ static type max() { return ULONG_MAX; };
__device__ __forceinline__ static type epsilon();
__device__ __forceinline__ static type round_error();
__device__ __forceinline__ static type denorm_min();
__device__ __forceinline__ static type infinity();
__device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = false;
};
template<> struct numeric_limits<float>
{
typedef float type;
__device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };
__device__ __forceinline__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; };
__device__ __forceinline__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; };
__device__ __forceinline__ static type round_error();
__device__ __forceinline__ static type denorm_min();
__device__ __forceinline__ static type infinity();
__device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = true;
};
template<> struct numeric_limits<double>
{
typedef double type;
__device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; };
__device__ __forceinline__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; };
__device__ __forceinline__ static type epsilon();
__device__ __forceinline__ static type round_error();
__device__ __forceinline__ static type denorm_min();
__device__ __forceinline__ static type infinity();
__device__ __forceinline__ static type quiet_NaN();
__device__ __forceinline__ static type signaling_NaN();
static const bool is_signed = true;
};
}}} // namespace cv { namespace gpu { namespace device {
#endif // __OPENCV_GPU_LIMITS_GPU_HPP__
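A short device-side sketch of how these traits are typically used, assuming the header above is included; the helper name below is illustrative only. numeric_limits<float>::max() seeds a running minimum so the loop body needs no special case for the first element.

__device__ float threadLocalMin(const float* data, int n)
{
    // Seed with the largest representable float so any element replaces it.
    float result = cv::gpu::device::numeric_limits<float>::max();
    for (int i = threadIdx.x; i < n; i += blockDim.x)
        result = ::fminf(result, data[i]);
    return result; // per-thread partial minimum; combine across the block with a reduction
}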

View File

@@ -1,216 +1,216 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_SATURATE_CAST_HPP__
#define __OPENCV_GPU_SATURATE_CAST_HPP__
#include "common.hpp"
namespace cv { namespace gpu { namespace device
{
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
{
return (uchar) ::max((int)v, 0);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
{
return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
{
return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
{
return (uchar) ::min(v, (uint)UCHAR_MAX);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
{
        return saturate_cast<uchar>((int)v); // go through int so negative shorts clamp to 0
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<uchar>(iv);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
return saturate_cast<uchar>(iv);
#else
return saturate_cast<uchar>((float)v);
#endif
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
{
return (schar) ::min((int)v, SCHAR_MAX);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
{
return (schar) ::min((uint)v, (uint)SCHAR_MAX);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
{
return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
{
return saturate_cast<schar>((int)v);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
{
return (schar) ::min(v, (uint)SCHAR_MAX);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<schar>(iv);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
return saturate_cast<schar>(iv);
#else
return saturate_cast<schar>((float)v);
#endif
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
{
return (ushort) ::max((int)v, 0);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
{
return (ushort) ::max((int)v, 0);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
{
return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
{
return (ushort) ::min(v, (uint)USHRT_MAX);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<ushort>(iv);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
return saturate_cast<ushort>(iv);
#else
return saturate_cast<ushort>((float)v);
#endif
}
template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
{
return (short) ::min((int)v, SHRT_MAX);
}
template<> __device__ __forceinline__ short saturate_cast<short>(int v)
{
return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
}
template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
{
return (short) ::min(v, (uint)SHRT_MAX);
}
template<> __device__ __forceinline__ short saturate_cast<short>(float v)
{
int iv = __float2int_rn(v);
return saturate_cast<short>(iv);
}
template<> __device__ __forceinline__ short saturate_cast<short>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
int iv = __double2int_rn(v);
return saturate_cast<short>(iv);
#else
return saturate_cast<short>((float)v);
#endif
}
template<> __device__ __forceinline__ int saturate_cast<int>(float v)
{
return __float2int_rn(v);
}
template<> __device__ __forceinline__ int saturate_cast<int>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
return __double2int_rn(v);
#else
return saturate_cast<int>((float)v);
#endif
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
{
return __float2uint_rn(v);
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(double v)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
return __double2uint_rn(v);
#else
return saturate_cast<uint>((float)v);
#endif
}
}}}
#endif /* __OPENCV_GPU_SATURATE_CAST_HPP__ */
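A hypothetical kernel sketch showing the intended use of saturate_cast: the rounded product is clamped to [0, 255] instead of wrapping. The kernel name and scaling factor are illustrative only.

__global__ void packToBytes(const float* src, unsigned char* dst, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = cv::gpu::device::saturate_cast<unsigned char>(src[i] * 255.0f); // rounds, then clamps
}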

View File

@@ -1,67 +1,67 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__
#define __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__
#if defined(__CUDACC__)
#define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
#else
#define __OPENCV_GPU_HOST_DEVICE__
#endif
namespace cv { namespace gpu
{
namespace device
{
template<bool expr> struct Static {};
template<> struct Static<true>
{
__OPENCV_GPU_HOST_DEVICE__ static void check() {};
};
}
}}
#undef __OPENCV_GPU_HOST_DEVICE__
#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
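A minimal sketch of how Static serves as a compile-time assertion: only Static<true> defines check(), so the call compiles only when the predicate holds. The function and predicate below are illustrative only.

template <typename T, int cn> void ensureSupportedChannelCount()
{
    // Fails to compile (no member 'check') when cn is outside [1, 4].
    cv::gpu::device::Static<(cn >= 1 && cn <= 4)>::check();
}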

View File

@@ -1,67 +1,67 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_TRANSFORM_HPP__
#define __OPENCV_GPU_TRANSFORM_HPP__
#include "common.hpp"
#include "utility.hpp"
#include "detail/transform_detail.hpp"
namespace cv { namespace gpu { namespace device
{
template <typename T, typename D, typename UnOp, typename Mask>
static inline void transform(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)
{
typedef TransformFunctorTraits<UnOp> ft;
transform_detail::TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream);
}
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static inline void transform(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, const Mask& mask, cudaStream_t stream)
{
typedef TransformFunctorTraits<BinOp> ft;
transform_detail::TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream);
}
}}}
#endif // __OPENCV_GPU_TRANSFORM_HPP__
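A hedged host-side sketch of driving transform with a unary functor, mirroring how the gpu module's own .cu files call it; it assumes functional.hpp (for the default TransformFunctorTraits) and utility.hpp (for WithOutMask) are available, and the functor and wrapper names are illustrative only. It must be compiled by nvcc.

#include <opencv2/gpu/device/functional.hpp> // assumed: provides TransformFunctorTraits
#include <opencv2/gpu/device/utility.hpp>    // assumed: provides WithOutMask
#include <opencv2/gpu/device/transform.hpp>

struct Scale
{
    float s;
    explicit Scale(float s_) : s(s_) {}
    __device__ __forceinline__ float operator()(float v) const { return v * s; }
};

void scaleBuffer(cv::gpu::PtrStepSz<float> src, cv::gpu::PtrStepSz<float> dst, float s, cudaStream_t stream)
{
    // Unmasked, per-element application of Scale; runs asynchronously on 'stream'.
    cv::gpu::device::transform(src, dst, Scale(s), cv::gpu::device::WithOutMask(), stream);
}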

View File

@@ -1,82 +1,82 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_TYPE_TRAITS_HPP__
#define __OPENCV_GPU_TYPE_TRAITS_HPP__
#include "detail/type_traits_detail.hpp"
namespace cv { namespace gpu { namespace device
{
template <typename T> struct IsSimpleParameter
{
enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value ||
type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<T>::type>::value};
};
template <typename T> struct TypeTraits
{
typedef typename type_traits_detail::UnConst<T>::type NonConstType;
typedef typename type_traits_detail::UnVolatile<T>::type NonVolatileType;
typedef typename type_traits_detail::UnVolatile<typename type_traits_detail::UnConst<T>::type>::type UnqualifiedType;
typedef typename type_traits_detail::PointerTraits<UnqualifiedType>::type PointeeType;
typedef typename type_traits_detail::ReferenceTraits<T>::type ReferredType;
enum { isConst = type_traits_detail::UnConst<T>::value };
enum { isVolatile = type_traits_detail::UnVolatile<T>::value };
enum { isReference = type_traits_detail::ReferenceTraits<UnqualifiedType>::value };
enum { isPointer = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value };
enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
enum { isSignedInt = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };
enum { isIntegral = type_traits_detail::IsIntegral<UnqualifiedType>::value };
enum { isFloat = type_traits_detail::IsFloat<UnqualifiedType>::value };
enum { isArith = isIntegral || isFloat };
enum { isVec = type_traits_detail::IsVec<UnqualifiedType>::value };
typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,
T, typename type_traits_detail::AddParameterType<T>::type>::type ParameterType;
};
}}}
#endif // __OPENCV_GPU_TYPE_TRAITS_HPP__
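A brief sketch of the intent behind TypeTraits<T>::ParameterType: simple arithmetic and pointer types are passed by value, anything larger through the form chosen by AddParameterType (typically a const reference). The accumulator below is illustrative only and assumes T supports operator+.

template <typename T> struct Accumulator
{
    // Picks the cheaper way to pass T into a device function.
    typedef typename cv::gpu::device::TypeTraits<T>::ParameterType param_type;

    T sum;

    __device__ __forceinline__ void add(param_type v) { sum = sum + v; }
};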

View File

@@ -1,237 +1,237 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_UTILITY_HPP__
#define __OPENCV_GPU_UTILITY_HPP__
#include "saturate_cast.hpp"
#include "datamov_utils.hpp"
#include "detail/reduction_detail.hpp"
namespace cv { namespace gpu { namespace device
{
#define OPENCV_GPU_LOG_WARP_SIZE (5)
#define OPENCV_GPU_WARP_SIZE (1 << OPENCV_GPU_LOG_WARP_SIZE)
#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
#define OPENCV_GPU_MEM_BANKS (1 << OPENCV_GPU_LOG_MEM_BANKS)
///////////////////////////////////////////////////////////////////////////////
// swap
template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)
{
const T temp = a;
a = b;
b = temp;
}
///////////////////////////////////////////////////////////////////////////////
// Mask Reader
struct SingleMask
{
explicit __host__ __device__ __forceinline__ SingleMask(PtrStepb mask_) : mask(mask_) {}
__host__ __device__ __forceinline__ SingleMask(const SingleMask& mask_): mask(mask_.mask){}
__device__ __forceinline__ bool operator()(int y, int x) const
{
return mask.ptr(y)[x] != 0;
}
PtrStepb mask;
};
struct SingleMaskChannels
{
__host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_)
: mask(mask_), channels(channels_) {}
__host__ __device__ __forceinline__ SingleMaskChannels(const SingleMaskChannels& mask_)
:mask(mask_.mask), channels(mask_.channels){}
__device__ __forceinline__ bool operator()(int y, int x) const
{
return mask.ptr(y)[x / channels] != 0;
}
PtrStepb mask;
int channels;
};
struct MaskCollection
{
explicit __host__ __device__ __forceinline__ MaskCollection(PtrStepb* maskCollection_)
: maskCollection(maskCollection_) {}
__device__ __forceinline__ MaskCollection(const MaskCollection& masks_)
: maskCollection(masks_.maskCollection), curMask(masks_.curMask){}
__device__ __forceinline__ void next()
{
curMask = *maskCollection++;
}
__device__ __forceinline__ void setMask(int z)
{
curMask = maskCollection[z];
}
__device__ __forceinline__ bool operator()(int y, int x) const
{
uchar val;
return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(y), x, val), (val != 0));
}
const PtrStepb* maskCollection;
PtrStepb curMask;
};
struct WithOutMask
{
__device__ __forceinline__ WithOutMask(){}
__device__ __forceinline__ WithOutMask(const WithOutMask& mask){}
__device__ __forceinline__ void next() const
{
}
__device__ __forceinline__ void setMask(int) const
{
}
__device__ __forceinline__ bool operator()(int, int) const
{
return true;
}
__device__ __forceinline__ bool operator()(int, int, int) const
{
return true;
}
static __device__ __forceinline__ bool check(int, int)
{
return true;
}
static __device__ __forceinline__ bool check(int, int, int, uint offset = 0)
{
return true;
}
};
///////////////////////////////////////////////////////////////////////////////
// Reduction
template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
}
template <int n, typename T, typename V, typename Pred>
__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
}
template <int n, typename T, typename V1, typename V2, typename Pred>
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
{
StaticAssert<n >= 8 && n <= 512>::check();
utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
}
///////////////////////////////////////////////////////////////////////////////
// Solve linear system
// solve 2x2 linear system Ax=b
template <typename T> __device__ __forceinline__ bool solve2x2(const T A[2][2], const T b[2], T x[2])
{
T det = A[0][0] * A[1][1] - A[1][0] * A[0][1];
if (det != 0)
{
double invdet = 1.0 / det;
x[0] = saturate_cast<T>(invdet * (b[0] * A[1][1] - b[1] * A[0][1]));
x[1] = saturate_cast<T>(invdet * (A[0][0] * b[1] - A[1][0] * b[0]));
return true;
}
return false;
}
// solve 3x3 linear system Ax=b
template <typename T> __device__ __forceinline__ bool solve3x3(const T A[3][3], const T b[3], T x[3])
{
T det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])
- A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])
+ A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);
if (det != 0)
{
double invdet = 1.0 / det;
x[0] = saturate_cast<T>(invdet *
(b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) +
A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] )));
x[1] = saturate_cast<T>(invdet *
(A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) -
b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0])));
x[2] = saturate_cast<T>(invdet *
(A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) -
A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) +
b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));
return true;
}
return false;
}
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_UTILITY_HPP__
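Below is a hedged usage sketch, not part of the commit, showing how the reduce<> primitive and the SingleMask reader above are typically combined: every thread accumulates a strided partial sum over one row, and a single block-wide reduction produces the row total. The kernel name, the block size, and the functional.hpp include for plus<> are assumptions for illustration.

#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/functional.hpp"

namespace sketch
{
    using namespace cv::gpu::device;

    // One block per row: sum the unmasked pixels of row blockIdx.x.
    // BLOCK_SIZE must satisfy the 8..512 StaticAssert inside reduce<>.
    template <int BLOCK_SIZE>
    __global__ void maskedRowSum(PtrStepb src, SingleMask mask, int cols, int* rowSums)
    {
        __shared__ int smem[BLOCK_SIZE];

        int partial = 0;
        for (int x = threadIdx.x; x < cols; x += BLOCK_SIZE)
            if (mask(blockIdx.x, x))
                partial += src.ptr(blockIdx.x)[x];

        // After the call, thread 0 holds the block-wide sum in 'partial'.
        reduce<BLOCK_SIZE>(smem, partial, threadIdx.x, plus<volatile int>());

        if (threadIdx.x == 0)
            rowSums[blockIdx.x] = partial;
    }
}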

@@ -1,224 +1,224 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_HPP__
#include "utility.hpp"
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"
namespace cv { namespace gpu { namespace device
{
template <typename T> struct L1Dist
{
typedef int value_type;
typedef int result_type;
__device__ __forceinline__ L1Dist() : mySum(0) {}
__device__ __forceinline__ void reduceIter(int val1, int val2)
{
mySum = __sad(val1, val2, mySum);
}
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
}
__device__ __forceinline__ operator int() const
{
return mySum;
}
int mySum;
};
template <> struct L1Dist<float>
{
typedef float value_type;
typedef float result_type;
__device__ __forceinline__ L1Dist() : mySum(0.0f) {}
__device__ __forceinline__ void reduceIter(float val1, float val2)
{
mySum += ::fabs(val1 - val2);
}
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
}
__device__ __forceinline__ operator float() const
{
return mySum;
}
float mySum;
};
struct L2Dist
{
typedef float value_type;
typedef float result_type;
__device__ __forceinline__ L2Dist() : mySum(0.0f) {}
__device__ __forceinline__ void reduceIter(float val1, float val2)
{
float reg = val1 - val2;
mySum += reg * reg;
}
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
}
__device__ __forceinline__ operator float() const
{
return sqrtf(mySum);
}
float mySum;
};
struct HammingDist
{
typedef int value_type;
typedef int result_type;
__device__ __forceinline__ HammingDist() : mySum(0) {}
__device__ __forceinline__ void reduceIter(int val1, int val2)
{
mySum += __popc(val1 ^ val2);
}
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{
reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
}
__device__ __forceinline__ operator int() const
{
return mySum;
}
int mySum;
};
// calc distance between two vectors in global memory
template <int THREAD_DIM, typename Dist, typename T1, typename T2>
__device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
{
for (int i = tid; i < len; i += THREAD_DIM)
{
T1 val1;
ForceGlob<T1>::Load(vec1, i, val1);
T2 val2;
ForceGlob<T2>::Load(vec2, i, val2);
dist.reduceIter(val1, val2);
}
dist.reduceAll<THREAD_DIM>(smem, tid);
}
// calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
__device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
{
vec_distance_detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);
dist.reduceAll<THREAD_DIM>(smem, tid);
}
// calc distance between two vectors in global memory
template <int THREAD_DIM, typename T1> struct VecDiffGlobal
{
explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0)
{
vec1 = vec1_;
}
template <typename T2, typename Dist>
__device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
{
calcVecDiffGlobal<THREAD_DIM>(vec1, vec2, len, dist, smem, tid);
}
const T1* vec1;
};
// calc distance between two vectors, first vector is cached in register memory, second vector is in global memory
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister
{
template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)
{
if (glob_tid < len)
smem[glob_tid] = vec1[glob_tid];
__syncthreads();
U* vec1ValsPtr = vec1Vals;
#pragma unroll
for (int i = tid; i < MAX_LEN; i += THREAD_DIM)
*vec1ValsPtr++ = smem[i];
__syncthreads();
}
template <typename T2, typename Dist>
__device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
{
calcVecDiffCached<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>(vec1Vals, vec2, len, dist, smem, tid);
}
U vec1Vals[MAX_LEN / THREAD_DIM];
};
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_VEC_DISTANCE_HPP__
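A hedged sketch, not part of the commit, of the intended use of L2Dist together with calcVecDiffGlobal above: one block computes the distance between one query descriptor and one train descriptor, with THREAD_DIM threads striding over the elements and the partial sums reduced in shared memory. The kernel and parameter names are assumptions; PtrStepf is the float step pointer from the GPU module.

#include "opencv2/gpu/device/vec_distance.hpp"

namespace sketch
{
    using namespace cv::gpu::device;

    // blockIdx.x selects the query row, blockIdx.y the train row.
    // THREAD_DIM must satisfy the 8..512 constraint of the underlying reduce<>.
    template <int THREAD_DIM>
    __global__ void pairwiseL2(PtrStepf query, PtrStepf train, int len, PtrStepf dists)
    {
        __shared__ float smem[THREAD_DIM];

        L2Dist dist;
        calcVecDiffGlobal<THREAD_DIM>(query.ptr(blockIdx.x), train.ptr(blockIdx.y), len, dist, smem, threadIdx.x);

        if (threadIdx.x == 0)
            dists.ptr(blockIdx.y)[blockIdx.x] = dist; // operator float() applies the final sqrtf
    }
}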

@@ -1,330 +1,330 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_VECMATH_HPP__
#define __OPENCV_GPU_VECMATH_HPP__
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
#include "functional.hpp"
namespace cv { namespace gpu { namespace device
{
namespace vec_math_detail
{
template <int cn, typename VecD> struct SatCastHelper;
template <typename VecD> struct SatCastHelper<1, VecD>
{
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
{
typedef typename VecTraits<VecD>::elem_type D;
return VecTraits<VecD>::make(saturate_cast<D>(v.x));
}
};
template <typename VecD> struct SatCastHelper<2, VecD>
{
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
{
typedef typename VecTraits<VecD>::elem_type D;
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));
}
};
template <typename VecD> struct SatCastHelper<3, VecD>
{
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
{
typedef typename VecTraits<VecD>::elem_type D;
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));
}
};
template <typename VecD> struct SatCastHelper<4, VecD>
{
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
{
typedef typename VecTraits<VecD>::elem_type D;
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));
}
};
template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v)
{
return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);
}
}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
#define OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, op, func) \
__device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a) \
{ \
func<type> f; \
return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x)); \
} \
__device__ __forceinline__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a) \
{ \
func<type> f; \
return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x), f(a.y)); \
} \
__device__ __forceinline__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a) \
{ \
func<type> f; \
return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x), f(a.y), f(a.z)); \
} \
__device__ __forceinline__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a) \
{ \
func<type> f; \
return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \
}
namespace vec_math_detail
{
template <typename T1, typename T2> struct BinOpTraits
{
typedef int argument_type;
};
template <typename T> struct BinOpTraits<T, T>
{
typedef T argument_type;
};
template <typename T> struct BinOpTraits<T, double>
{
typedef double argument_type;
};
template <typename T> struct BinOpTraits<double, T>
{
typedef double argument_type;
};
template <> struct BinOpTraits<double, double>
{
typedef double argument_type;
};
template <typename T> struct BinOpTraits<T, float>
{
typedef float argument_type;
};
template <typename T> struct BinOpTraits<float, T>
{
typedef float argument_type;
};
template <> struct BinOpTraits<float, float>
{
typedef float argument_type;
};
template <> struct BinOpTraits<double, float>
{
typedef double argument_type;
};
template <> struct BinOpTraits<float, double>
{
typedef double argument_type;
};
}
#define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \
__device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
{ \
func<type> f; \
return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x, b.x)); \
} \
template <typename T> \
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
{ \
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
} \
template <typename T> \
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
{ \
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
} \
__device__ __forceinline__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
{ \
func<type> f; \
return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x, b.x), f(a.y, b.y)); \
} \
template <typename T> \
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
{ \
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
} \
template <typename T> \
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
{ \
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
} \
__device__ __forceinline__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
{ \
func<type> f; \
return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \
} \
template <typename T> \
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
{ \
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
} \
template <typename T> \
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
{ \
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
} \
__device__ __forceinline__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
{ \
func<type> f; \
return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \
} \
template <typename T> \
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
{ \
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
} \
template <typename T> \
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
{ \
func<typename vec_math_detail::BinOpTraits<T, type>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
}
#define OPENCV_GPU_IMPLEMENT_VEC_OP(type) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator +, plus) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator -, minus) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator *, multiplies) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator /, divides) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator -, negate) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ==, equal_to) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator !=, not_equal_to) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator > , greater) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator < , less) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator >=, greater_equal) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator <=, less_equal) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator &&, logical_and) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ||, logical_or) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, fabs, fabs_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp10, exp10_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, log, log_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, log2, log2_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, log10, log10_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sin, sin_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, cos, cos_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, tan, tan_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, asin, asin_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, acos, acos_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, atan, atan_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sinh, sinh_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, cosh, cosh_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, tanh, tanh_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, asinh, asinh_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, acosh, acosh_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, atanh, atanh_func) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, hypot, hypot_func) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, atan2, atan2_func) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, pow, pow_func) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, hypot_sqr, hypot_sqr_func)
#define OPENCV_GPU_IMPLEMENT_VEC_INT_OP(type) \
OPENCV_GPU_IMPLEMENT_VEC_OP(type) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator &, bit_and) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator |, bit_or) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ^, bit_xor) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ~, bit_not)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uchar)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(char)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(ushort)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(short)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(int)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uint)
OPENCV_GPU_IMPLEMENT_VEC_OP(float)
OPENCV_GPU_IMPLEMENT_VEC_OP(double)
#undef OPENCV_GPU_IMPLEMENT_VEC_UNOP
#undef OPENCV_GPU_IMPLEMENT_VEC_BINOP
#undef OPENCV_GPU_IMPLEMENT_VEC_OP
#undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_VECMATH_HPP__
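A hedged sketch of what the generated element-wise vector operators and the vector saturate_cast above enable: uchar3 pixels promoted to float3 arithmetic and clamped back per channel. The blend helper and its weighting are illustrative only, not part of the library.

#include "opencv2/gpu/device/vec_math.hpp"

namespace sketch
{
    using namespace cv::gpu::device;

    // operator*(uchar3, float) and operator+(float3, float3) come from the
    // OPENCV_GPU_IMPLEMENT_VEC_OP expansions above; saturate_cast<uchar3>
    // clamps each channel back to [0, 255].
    __device__ __forceinline__ uchar3 blend(const uchar3& a, const uchar3& b, float alpha)
    {
        float3 mixed = a * alpha + b * (1.0f - alpha);
        return saturate_cast<uchar3>(mixed);
    }
}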
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_VECMATH_HPP__
#define __OPENCV_GPU_VECMATH_HPP__
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
#include "functional.hpp"
namespace cv { namespace gpu { namespace device
{
namespace vec_math_detail
{
template <int cn, typename VecD> struct SatCastHelper;
template <typename VecD> struct SatCastHelper<1, VecD>
{
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
{
typedef typename VecTraits<VecD>::elem_type D;
return VecTraits<VecD>::make(saturate_cast<D>(v.x));
}
};
template <typename VecD> struct SatCastHelper<2, VecD>
{
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
{
typedef typename VecTraits<VecD>::elem_type D;
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));
}
};
template <typename VecD> struct SatCastHelper<3, VecD>
{
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
{
typedef typename VecTraits<VecD>::elem_type D;
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));
}
};
template <typename VecD> struct SatCastHelper<4, VecD>
{
template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
{
typedef typename VecTraits<VecD>::elem_type D;
return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));
}
};
template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v)
{
return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);
}
}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double1& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double2& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double3& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double4& v) {return vec_math_detail::saturate_cast_caller<_Tp>(v);}
#define OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, op, func) \
__device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a) \
{ \
func<type> f; \
return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x)); \
} \
__device__ __forceinline__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a) \
{ \
func<type> f; \
return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x), f(a.y)); \
} \
__device__ __forceinline__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a) \
{ \
func<type> f; \
return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x), f(a.y), f(a.z)); \
} \
__device__ __forceinline__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a) \
{ \
func<type> f; \
return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \
}
namespace vec_math_detail
{
template <typename T1, typename T2> struct BinOpTraits
{
typedef int argument_type;
};
template <typename T> struct BinOpTraits<T, T>
{
typedef T argument_type;
};
template <typename T> struct BinOpTraits<T, double>
{
typedef double argument_type;
};
template <typename T> struct BinOpTraits<double, T>
{
typedef double argument_type;
};
template <> struct BinOpTraits<double, double>
{
typedef double argument_type;
};
template <typename T> struct BinOpTraits<T, float>
{
typedef float argument_type;
};
template <typename T> struct BinOpTraits<float, T>
{
typedef float argument_type;
};
template <> struct BinOpTraits<float, float>
{
typedef float argument_type;
};
template <> struct BinOpTraits<double, float>
{
typedef double argument_type;
};
template <> struct BinOpTraits<float, double>
{
typedef double argument_type;
};
}
#define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \
__device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \
{ \
func<type> f; \
return VecTraits<TypeVec<func<type>::result_type, 1>::vec_type>::make(f(a.x, b.x)); \
} \
template <typename T> \
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(const type ## 1 & v, T s) \
{ \
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(v.x, s)); \
} \
template <typename T> \
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type op(T s, const type ## 1 & v) \
{ \
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 1>::vec_type>::make(f(s, v.x)); \
} \
__device__ __forceinline__ TypeVec<func<type>::result_type, 2>::vec_type op(const type ## 2 & a, const type ## 2 & b) \
{ \
func<type> f; \
return VecTraits<TypeVec<func<type>::result_type, 2>::vec_type>::make(f(a.x, b.x), f(a.y, b.y)); \
} \
template <typename T> \
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(const type ## 2 & v, T s) \
{ \
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(v.x, s), f(v.y, s)); \
} \
template <typename T> \
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type op(T s, const type ## 2 & v) \
{ \
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 2>::vec_type>::make(f(s, v.x), f(s, v.y)); \
} \
__device__ __forceinline__ TypeVec<func<type>::result_type, 3>::vec_type op(const type ## 3 & a, const type ## 3 & b) \
{ \
func<type> f; \
return VecTraits<TypeVec<func<type>::result_type, 3>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \
} \
template <typename T> \
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(const type ## 3 & v, T s) \
{ \
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s)); \
} \
template <typename T> \
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type op(T s, const type ## 3 & v) \
{ \
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 3>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z)); \
} \
__device__ __forceinline__ TypeVec<func<type>::result_type, 4>::vec_type op(const type ## 4 & a, const type ## 4 & b) \
{ \
func<type> f; \
return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \
} \
template <typename T> \
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(const type ## 4 & v, T s) \
{ \
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(v.x, s), f(v.y, s), f(v.z, s), f(v.w, s)); \
} \
template <typename T> \
__device__ __forceinline__ typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type op(T s, const type ## 4 & v) \
{ \
func<typename vec_math_detail::BinOpTraits<type, T>::argument_type> f; \
return VecTraits<typename TypeVec<typename func<typename vec_math_detail::BinOpTraits<type, T>::argument_type>::result_type, 4>::vec_type>::make(f(s, v.x), f(s, v.y), f(s, v.z), f(s, v.w)); \
}
#define OPENCV_GPU_IMPLEMENT_VEC_OP(type) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator +, plus) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator -, minus) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator *, multiplies) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator /, divides) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator -, negate) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ==, equal_to) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator !=, not_equal_to) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator > , greater) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator < , less) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator >=, greater_equal) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator <=, less_equal) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator &&, logical_and) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ||, logical_or) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, fabs, fabs_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp10, exp10_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, log, log_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, log2, log2_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, log10, log10_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sin, sin_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, cos, cos_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, tan, tan_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, asin, asin_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, acos, acos_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, atan, atan_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sinh, sinh_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, cosh, cosh_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, tanh, tanh_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, asinh, asinh_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, acosh, acosh_func) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, atanh, atanh_func) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, hypot, hypot_func) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, atan2, atan2_func) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, pow, pow_func) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, hypot_sqr, hypot_sqr_func)
#define OPENCV_GPU_IMPLEMENT_VEC_INT_OP(type) \
OPENCV_GPU_IMPLEMENT_VEC_OP(type) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator &, bit_and) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator |, bit_or) \
OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ^, bit_xor) \
OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ~, bit_not)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uchar)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(char)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(ushort)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(short)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(int)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uint)
OPENCV_GPU_IMPLEMENT_VEC_OP(float)
OPENCV_GPU_IMPLEMENT_VEC_OP(double)
#undef OPENCV_GPU_IMPLEMENT_VEC_UNOP
#undef OPENCV_GPU_IMPLEMENT_VEC_BINOP
#undef OPENCV_GPU_IMPLEMENT_VEC_OP
#undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_VECMATH_HPP__
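// Usage sketch (illustrative only, not part of this commit): the macros above generate
// element-wise operators and math functions for the CUDA built-in vector types, covering
// vector/vector and mixed vector/scalar operands. The kernel name below is hypothetical;
// the include path assumes the usual installed layout of these device headers.
#include "opencv2/gpu/device/vec_math.hpp"

__global__ void blend_demo(const float3* a, const float3* b, float3* dst, int n)
{
    using namespace cv::gpu::device;            // brings the generated operators into scope
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = a[i] * 0.5f + b[i] * 0.5f;     // float3 * float and float3 + float3
}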

View File

@@ -1,280 +1,280 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_VEC_TRAITS_HPP__
#define __OPENCV_GPU_VEC_TRAITS_HPP__
#include "common.hpp"
namespace cv { namespace gpu { namespace device
{
template<typename T, int N> struct TypeVec;
struct __align__(8) uchar8
{
uchar a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)
{
uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(8) char8
{
schar a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)
{
char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(16) ushort8
{
ushort a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)
{
ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(16) short8
{
short a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)
{
short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(32) uint8
{
uint a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)
{
uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(32) int8
{
int a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)
{
int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(32) float8
{
float a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)
{
float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct double8
{
double a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)
{
double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
#define OPENCV_GPU_IMPLEMENT_TYPE_VEC(type) \
template<> struct TypeVec<type, 1> { typedef type vec_type; }; \
template<> struct TypeVec<type ## 1, 1> { typedef type ## 1 vec_type; }; \
template<> struct TypeVec<type, 2> { typedef type ## 2 vec_type; }; \
template<> struct TypeVec<type ## 2, 2> { typedef type ## 2 vec_type; }; \
template<> struct TypeVec<type, 3> { typedef type ## 3 vec_type; }; \
template<> struct TypeVec<type ## 3, 3> { typedef type ## 3 vec_type; }; \
template<> struct TypeVec<type, 4> { typedef type ## 4 vec_type; }; \
template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; }; \
template<> struct TypeVec<type, 8> { typedef type ## 8 vec_type; }; \
template<> struct TypeVec<type ## 8, 8> { typedef type ## 8 vec_type; };
OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(ushort)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(short)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(int)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(uint)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(float)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(double)
#undef OPENCV_GPU_IMPLEMENT_TYPE_VEC
template<> struct TypeVec<schar, 1> { typedef schar vec_type; };
template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };
template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };
template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };
template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };
template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };
template<typename T> struct VecTraits;
#define OPENCV_GPU_IMPLEMENT_VEC_TRAITS(type) \
template<> struct VecTraits<type> \
{ \
typedef type elem_type; \
enum {cn=1}; \
static __device__ __host__ __forceinline__ type all(type v) {return v;} \
static __device__ __host__ __forceinline__ type make(type x) {return x;} \
static __device__ __host__ __forceinline__ type make(const type* v) {return *v;} \
}; \
template<> struct VecTraits<type ## 1> \
{ \
typedef type elem_type; \
enum {cn=1}; \
static __device__ __host__ __forceinline__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
static __device__ __host__ __forceinline__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
static __device__ __host__ __forceinline__ type ## 1 make(const type* v) {return make_ ## type ## 1(*v);} \
}; \
template<> struct VecTraits<type ## 2> \
{ \
typedef type elem_type; \
enum {cn=2}; \
static __device__ __host__ __forceinline__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
static __device__ __host__ __forceinline__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
static __device__ __host__ __forceinline__ type ## 2 make(const type* v) {return make_ ## type ## 2(v[0], v[1]);} \
}; \
template<> struct VecTraits<type ## 3> \
{ \
typedef type elem_type; \
enum {cn=3}; \
static __device__ __host__ __forceinline__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
static __device__ __host__ __forceinline__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
static __device__ __host__ __forceinline__ type ## 3 make(const type* v) {return make_ ## type ## 3(v[0], v[1], v[2]);} \
}; \
template<> struct VecTraits<type ## 4> \
{ \
typedef type elem_type; \
enum {cn=4}; \
static __device__ __host__ __forceinline__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
static __device__ __host__ __forceinline__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
static __device__ __host__ __forceinline__ type ## 4 make(const type* v) {return make_ ## type ## 4(v[0], v[1], v[2], v[3]);} \
}; \
template<> struct VecTraits<type ## 8> \
{ \
typedef type elem_type; \
enum {cn=8}; \
static __device__ __host__ __forceinline__ type ## 8 all(type v) {return make_ ## type ## 8(v, v, v, v, v, v, v, v);} \
static __device__ __host__ __forceinline__ type ## 8 make(type a0, type a1, type a2, type a3, type a4, type a5, type a6, type a7) {return make_ ## type ## 8(a0, a1, a2, a3, a4, a5, a6, a7);} \
static __device__ __host__ __forceinline__ type ## 8 make(const type* v) {return make_ ## type ## 8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);} \
};
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(ushort)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(short)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(int)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uint)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(float)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(double)
#undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS
template<> struct VecTraits<char>
{
typedef char elem_type;
enum {cn=1};
static __device__ __host__ __forceinline__ char all(char v) {return v;}
static __device__ __host__ __forceinline__ char make(char x) {return x;}
static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}
};
template<> struct VecTraits<schar>
{
typedef schar elem_type;
enum {cn=1};
static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}
};
template<> struct VecTraits<char1>
{
typedef schar elem_type;
enum {cn=1};
static __device__ __host__ __forceinline__ char1 all(schar v) {return make_char1(v);}
static __device__ __host__ __forceinline__ char1 make(schar x) {return make_char1(x);}
static __device__ __host__ __forceinline__ char1 make(const schar* v) {return make_char1(v[0]);}
};
template<> struct VecTraits<char2>
{
typedef schar elem_type;
enum {cn=2};
static __device__ __host__ __forceinline__ char2 all(schar v) {return make_char2(v, v);}
static __device__ __host__ __forceinline__ char2 make(schar x, schar y) {return make_char2(x, y);}
static __device__ __host__ __forceinline__ char2 make(const schar* v) {return make_char2(v[0], v[1]);}
};
template<> struct VecTraits<char3>
{
typedef schar elem_type;
enum {cn=3};
static __device__ __host__ __forceinline__ char3 all(schar v) {return make_char3(v, v, v);}
static __device__ __host__ __forceinline__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);}
static __device__ __host__ __forceinline__ char3 make(const schar* v) {return make_char3(v[0], v[1], v[2]);}
};
template<> struct VecTraits<char4>
{
typedef schar elem_type;
enum {cn=4};
static __device__ __host__ __forceinline__ char4 all(schar v) {return make_char4(v, v, v, v);}
static __device__ __host__ __forceinline__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);}
static __device__ __host__ __forceinline__ char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);}
};
template<> struct VecTraits<char8>
{
typedef schar elem_type;
enum {cn=8};
static __device__ __host__ __forceinline__ char8 all(schar v) {return make_char8(v, v, v, v, v, v, v, v);}
static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}
static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);}
};
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_VEC_TRAITS_HPP__
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_VEC_TRAITS_HPP__
#define __OPENCV_GPU_VEC_TRAITS_HPP__
#include "common.hpp"
namespace cv { namespace gpu { namespace device
{
template<typename T, int N> struct TypeVec;
struct __align__(8) uchar8
{
uchar a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)
{
uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(8) char8
{
schar a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)
{
char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(16) ushort8
{
ushort a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)
{
ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(16) short8
{
short a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)
{
short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(32) uint8
{
uint a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)
{
uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(32) int8
{
int a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)
{
int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct __align__(32) float8
{
float a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)
{
float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
struct double8
{
double a0, a1, a2, a3, a4, a5, a6, a7;
};
static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)
{
double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
return val;
}
#define OPENCV_GPU_IMPLEMENT_TYPE_VEC(type) \
template<> struct TypeVec<type, 1> { typedef type vec_type; }; \
template<> struct TypeVec<type ## 1, 1> { typedef type ## 1 vec_type; }; \
template<> struct TypeVec<type, 2> { typedef type ## 2 vec_type; }; \
template<> struct TypeVec<type ## 2, 2> { typedef type ## 2 vec_type; }; \
template<> struct TypeVec<type, 3> { typedef type ## 3 vec_type; }; \
template<> struct TypeVec<type ## 3, 3> { typedef type ## 3 vec_type; }; \
template<> struct TypeVec<type, 4> { typedef type ## 4 vec_type; }; \
template<> struct TypeVec<type ## 4, 4> { typedef type ## 4 vec_type; }; \
template<> struct TypeVec<type, 8> { typedef type ## 8 vec_type; }; \
template<> struct TypeVec<type ## 8, 8> { typedef type ## 8 vec_type; };
OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(ushort)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(short)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(int)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(uint)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(float)
OPENCV_GPU_IMPLEMENT_TYPE_VEC(double)
#undef OPENCV_GPU_IMPLEMENT_TYPE_VEC
template<> struct TypeVec<schar, 1> { typedef schar vec_type; };
template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };
template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };
template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };
template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };
template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };
template<typename T> struct VecTraits;
#define OPENCV_GPU_IMPLEMENT_VEC_TRAITS(type) \
template<> struct VecTraits<type> \
{ \
typedef type elem_type; \
enum {cn=1}; \
static __device__ __host__ __forceinline__ type all(type v) {return v;} \
static __device__ __host__ __forceinline__ type make(type x) {return x;} \
static __device__ __host__ __forceinline__ type make(const type* v) {return *v;} \
}; \
template<> struct VecTraits<type ## 1> \
{ \
typedef type elem_type; \
enum {cn=1}; \
static __device__ __host__ __forceinline__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
static __device__ __host__ __forceinline__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
static __device__ __host__ __forceinline__ type ## 1 make(const type* v) {return make_ ## type ## 1(*v);} \
}; \
template<> struct VecTraits<type ## 2> \
{ \
typedef type elem_type; \
enum {cn=2}; \
static __device__ __host__ __forceinline__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
static __device__ __host__ __forceinline__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
static __device__ __host__ __forceinline__ type ## 2 make(const type* v) {return make_ ## type ## 2(v[0], v[1]);} \
}; \
template<> struct VecTraits<type ## 3> \
{ \
typedef type elem_type; \
enum {cn=3}; \
static __device__ __host__ __forceinline__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
static __device__ __host__ __forceinline__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
static __device__ __host__ __forceinline__ type ## 3 make(const type* v) {return make_ ## type ## 3(v[0], v[1], v[2]);} \
}; \
template<> struct VecTraits<type ## 4> \
{ \
typedef type elem_type; \
enum {cn=4}; \
static __device__ __host__ __forceinline__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
static __device__ __host__ __forceinline__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
static __device__ __host__ __forceinline__ type ## 4 make(const type* v) {return make_ ## type ## 4(v[0], v[1], v[2], v[3]);} \
}; \
template<> struct VecTraits<type ## 8> \
{ \
typedef type elem_type; \
enum {cn=8}; \
static __device__ __host__ __forceinline__ type ## 8 all(type v) {return make_ ## type ## 8(v, v, v, v, v, v, v, v);} \
static __device__ __host__ __forceinline__ type ## 8 make(type a0, type a1, type a2, type a3, type a4, type a5, type a6, type a7) {return make_ ## type ## 8(a0, a1, a2, a3, a4, a5, a6, a7);} \
static __device__ __host__ __forceinline__ type ## 8 make(const type* v) {return make_ ## type ## 8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);} \
};
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(ushort)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(short)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(int)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uint)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(float)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(double)
#undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS
template<> struct VecTraits<char>
{
typedef char elem_type;
enum {cn=1};
static __device__ __host__ __forceinline__ char all(char v) {return v;}
static __device__ __host__ __forceinline__ char make(char x) {return x;}
static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}
};
template<> struct VecTraits<schar>
{
typedef schar elem_type;
enum {cn=1};
static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}
};
template<> struct VecTraits<char1>
{
typedef schar elem_type;
enum {cn=1};
static __device__ __host__ __forceinline__ char1 all(schar v) {return make_char1(v);}
static __device__ __host__ __forceinline__ char1 make(schar x) {return make_char1(x);}
static __device__ __host__ __forceinline__ char1 make(const schar* v) {return make_char1(v[0]);}
};
template<> struct VecTraits<char2>
{
typedef schar elem_type;
enum {cn=2};
static __device__ __host__ __forceinline__ char2 all(schar v) {return make_char2(v, v);}
static __device__ __host__ __forceinline__ char2 make(schar x, schar y) {return make_char2(x, y);}
static __device__ __host__ __forceinline__ char2 make(const schar* v) {return make_char2(v[0], v[1]);}
};
template<> struct VecTraits<char3>
{
typedef schar elem_type;
enum {cn=3};
static __device__ __host__ __forceinline__ char3 all(schar v) {return make_char3(v, v, v);}
static __device__ __host__ __forceinline__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);}
static __device__ __host__ __forceinline__ char3 make(const schar* v) {return make_char3(v[0], v[1], v[2]);}
};
template<> struct VecTraits<char4>
{
typedef schar elem_type;
enum {cn=4};
static __device__ __host__ __forceinline__ char4 all(schar v) {return make_char4(v, v, v, v);}
static __device__ __host__ __forceinline__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);}
static __device__ __host__ __forceinline__ char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);}
};
template<> struct VecTraits<char8>
{
typedef schar elem_type;
enum {cn=8};
static __device__ __host__ __forceinline__ char8 all(schar v) {return make_char8(v, v, v, v, v, v, v, v);}
static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}
static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);}
};
}}} // namespace cv { namespace gpu { namespace device
#endif // __OPENCV_GPU_VEC_TRAITS_HPP__
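// Usage sketch (illustrative only, not part of this commit): TypeVec and VecTraits let
// templated device code build and broadcast vectors generically. The helper below is
// hypothetical and assumes the usual installed header path.
#include "opencv2/gpu/device/vec_traits.hpp"

template <typename T>
__device__ typename cv::gpu::device::TypeVec<T, 4>::vec_type splat4(T v)
{
    typedef typename cv::gpu::device::TypeVec<T, 4>::vec_type vec4;   // e.g. uchar -> uchar4
    return cv::gpu::device::VecTraits<vec4>::all(v);                  // e.g. make_uchar4(v, v, v, v)
}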

View File

@@ -1,112 +1,112 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_DEVICE_WARP_HPP__
#define __OPENCV_GPU_DEVICE_WARP_HPP__
namespace cv { namespace gpu { namespace device
{
struct Warp
{
enum
{
LOG_WARP_SIZE = 5,
WARP_SIZE = 1 << LOG_WARP_SIZE,
STRIDE = WARP_SIZE
};
/** \brief Returns the warp lane ID of the calling thread. */
static __device__ __forceinline__ unsigned int laneId()
{
unsigned int ret;
asm("mov.u32 %0, %laneid;" : "=r"(ret) );
return ret;
}
template<typename It, typename T>
static __device__ __forceinline__ void fill(It beg, It end, const T& value)
{
for(It t = beg + laneId(); t < end; t += STRIDE)
*t = value;
}
template<typename InIt, typename OutIt>
static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
{
for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
*out = *t;
return out;
}
template<typename InIt, typename OutIt, class UnOp>
static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)
{
for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
*out = op(*t);
return out;
}
template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
static __device__ __forceinline__ OutIt transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
{
unsigned int lane = laneId();
InIt1 t1 = beg1 + lane;
InIt2 t2 = beg2 + lane;
for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)
*out = op(*t1, *t2);
return out;
}
template<typename OutIt, typename T>
static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
{
unsigned int lane = laneId();
value += lane;
for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)
*t = value;
}
};
}}} // namespace cv { namespace gpu { namespace device
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_DEVICE_WARP_HPP__
#define __OPENCV_GPU_DEVICE_WARP_HPP__
namespace cv { namespace gpu { namespace device
{
struct Warp
{
enum
{
LOG_WARP_SIZE = 5,
WARP_SIZE = 1 << LOG_WARP_SIZE,
STRIDE = WARP_SIZE
};
/** \brief Returns the warp lane ID of the calling thread. */
static __device__ __forceinline__ unsigned int laneId()
{
unsigned int ret;
asm("mov.u32 %0, %laneid;" : "=r"(ret) );
return ret;
}
template<typename It, typename T>
static __device__ __forceinline__ void fill(It beg, It end, const T& value)
{
for(It t = beg + laneId(); t < end; t += STRIDE)
*t = value;
}
template<typename InIt, typename OutIt>
static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
{
for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
*out = *t;
return out;
}
template<typename InIt, typename OutIt, class UnOp>
static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)
{
for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
*out = op(*t);
return out;
}
template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
static __device__ __forceinline__ OutIt transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
{
unsigned int lane = laneId();
InIt1 t1 = beg1 + lane;
InIt2 t2 = beg2 + lane;
for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)
*out = op(*t1, *t2);
return out;
}
template<typename OutIt, typename T>
static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
{
unsigned int lane = laneId();
value += lane;
for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)
*t = value;
}
};
}}} // namespace cv { namespace gpu { namespace device
#endif /* __OPENCV_GPU_DEVICE_WARP_HPP__ */
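// Usage sketch (illustrative only, not part of this commit): the Warp helpers above run
// warp-cooperative loops in which each lane handles elements laneId(), laneId() + 32, ...
// The kernel below is hypothetical and assumes one warp per block plus the usual
// installed header path.
#include "opencv2/gpu/device/warp.hpp"

__global__ void warp_init_demo(int* buf, int n)
{
    using cv::gpu::device::Warp;
    Warp::fill(buf, buf + n, 0);   // every lane writes its strided subset of [buf, buf + n)
    Warp::yota(buf, buf + n, 1);   // buf[i] = 1 + i (warp-wide iota, spelled "yota" in this header)
}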

View File

@@ -1,69 +1,69 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_GPU_WARP_REDUCE_HPP__
#define OPENCV_GPU_WARP_REDUCE_HPP__
namespace cv { namespace gpu { namespace device
{
template <class T>
__device__ __forceinline__ T warp_reduce(volatile T *ptr , const unsigned int tid = threadIdx.x)
{
const unsigned int lane = tid & 31; // index of thread in warp (0..31)
if (lane < 16)
{
T partial = ptr[tid];
ptr[tid] = partial = partial + ptr[tid + 16];
ptr[tid] = partial = partial + ptr[tid + 8];
ptr[tid] = partial = partial + ptr[tid + 4];
ptr[tid] = partial = partial + ptr[tid + 2];
ptr[tid] = partial = partial + ptr[tid + 1];
}
return ptr[tid - lane];
}
}}} // namespace cv { namespace gpu { namespace device {
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_GPU_WARP_REDUCE_HPP__
#define OPENCV_GPU_WARP_REDUCE_HPP__
namespace cv { namespace gpu { namespace device
{
template <class T>
__device__ __forceinline__ T warp_reduce(volatile T *ptr , const unsigned int tid = threadIdx.x)
{
const unsigned int lane = tid & 31; // index of thread in warp (0..31)
if (lane < 16)
{
T partial = ptr[tid];
ptr[tid] = partial = partial + ptr[tid + 16];
ptr[tid] = partial = partial + ptr[tid + 8];
ptr[tid] = partial = partial + ptr[tid + 4];
ptr[tid] = partial = partial + ptr[tid + 2];
ptr[tid] = partial = partial + ptr[tid + 1];
}
return ptr[tid - lane];
}
}}} // namespace cv { namespace gpu { namespace device {
#endif /* OPENCV_GPU_WARP_REDUCE_HPP__ */
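// Usage sketch (illustrative only, not part of this commit): warp_reduce() above sums the
// 32 shared-memory slots owned by one warp with a five-step tree and returns the value in
// the warp-base slot. The kernel below is hypothetical, launches one warp per block, and
// assumes the usual installed header path.
#include "opencv2/gpu/device/warp_reduce.hpp"

__global__ void warp_sum_demo(const float* src, float* sums)
{
    __shared__ volatile float smem[32];
    const unsigned int tid = threadIdx.x;                        // lane index; the block is a single warp
    smem[tid] = src[blockIdx.x * 32 + tid];
    const float sum = cv::gpu::device::warp_reduce(smem, tid);   // relies on warp-synchronous execution
    if (tid == 0)
        sums[blockIdx.x] = sum;
}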

View File

@@ -1,43 +1,43 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/core/cuda_devptrs.hpp"
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/core/cuda_devptrs.hpp"

File diff suppressed because it is too large Load Diff

View File

@@ -1,43 +1,43 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/core/gpumat.hpp"
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/core/gpumat.hpp"

View File

@@ -1,64 +1,64 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_STREAM_ACCESSOR_HPP__
#define __OPENCV_GPU_STREAM_ACCESSOR_HPP__
#include "opencv2/gpu/gpu.hpp"
#include "cuda_runtime_api.h"
namespace cv
{
namespace gpu
{
// This is the only header file that depends on CUDA; all other headers are independent.
// So if you use OpenCV binaries you do not need to install the CUDA Toolkit.
// But if you want to use the GPU yourself, you can obtain the CUDA stream instance via the class below.
// In this case you have to install the CUDA Toolkit.
struct StreamAccessor
{
CV_EXPORTS static cudaStream_t getStream(const Stream& stream);
};
}
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_STREAM_ACCESSOR_HPP__
#define __OPENCV_GPU_STREAM_ACCESSOR_HPP__
#include "opencv2/gpu/gpu.hpp"
#include "cuda_runtime_api.h"
namespace cv
{
namespace gpu
{
// This is the only header file that depends on CUDA; all other headers are independent.
// So if you use OpenCV binaries you do not need to install the CUDA Toolkit.
// But if you want to use the GPU yourself, you can obtain the CUDA stream instance via the class below.
// In this case you have to install the CUDA Toolkit.
struct StreamAccessor
{
CV_EXPORTS static cudaStream_t getStream(const Stream& stream);
};
}
}
#endif /* __OPENCV_GPU_STREAM_ACCESSOR_HPP__ */
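// Usage sketch (illustrative only, not part of this commit): StreamAccessor exposes the raw
// cudaStream_t behind cv::gpu::Stream so user code can queue its own CUDA work on the same
// stream. The function and kernel names below are hypothetical.
#include "opencv2/gpu/stream_accessor.hpp"

void run_on_opencv_stream(cv::gpu::Stream& stream)
{
    cudaStream_t s = cv::gpu::StreamAccessor::getStream(stream);
    // my_kernel<<<grid, block, 0, s>>>(...);   // user-provided kernel, launched on the same stream
    cudaStreamSynchronize(s);                   // block until all work queued on the stream has finished
}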

0
modules/gpu/misc/mark_nvidia.py Normal file → Executable file
View File

View File

@@ -1,381 +1,381 @@
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
namespace {
//////////////////////////////////////////////////////////////////////
// StereoBM
typedef std::tr1::tuple<string, string> pair_string;
DEF_PARAM_TEST_1(ImagePair, pair_string);
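// The tests below share one pattern: load the inputs, then, when PERF_RUN_GPU() is set,
// time the GPU path with TEST_CYCLE() and validate it with GPU_SANITY_CHECK(); otherwise
// run the CPU analogue under CPU_SANITY_CHECK(), or FAIL() when no CPU analogue exists.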
PERF_TEST_P(ImagePair, Calib3D_StereoBM, Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png")))
{
declare.time(5.0);
const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgLeft.empty());
const cv::Mat imgRight = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgRight.empty());
const int preset = 0;
const int ndisp = 256;
if (PERF_RUN_GPU())
{
cv::gpu::StereoBM_GPU d_bm(preset, ndisp);
cv::gpu::GpuMat d_imgLeft(imgLeft);
cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat d_dst;
d_bm(d_imgLeft, d_imgRight, d_dst);
TEST_CYCLE()
{
d_bm(d_imgLeft, d_imgRight, d_dst);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::StereoBM bm(preset, ndisp);
cv::Mat dst;
bm(imgLeft, imgRight, dst);
TEST_CYCLE()
{
bm(imgLeft, imgRight, dst);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// StereoBeliefPropagation
PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation, Values(pair_string("gpu/stereobp/aloe-L.png", "gpu/stereobp/aloe-R.png")))
{
declare.time(10.0);
const cv::Mat imgLeft = readImage(GET_PARAM(0));
ASSERT_FALSE(imgLeft.empty());
const cv::Mat imgRight = readImage(GET_PARAM(1));
ASSERT_FALSE(imgRight.empty());
const int ndisp = 64;
if (PERF_RUN_GPU())
{
cv::gpu::StereoBeliefPropagation d_bp(ndisp);
cv::gpu::GpuMat d_imgLeft(imgLeft);
cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat d_dst;
d_bp(d_imgLeft, d_imgRight, d_dst);
TEST_CYCLE()
{
d_bp(d_imgLeft, d_imgRight, d_dst);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
FAIL() << "No such CPU implementation analogy.";
}
}
//////////////////////////////////////////////////////////////////////
// StereoConstantSpaceBP
PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP, Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-R.png")))
{
declare.time(10.0);
const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgLeft.empty());
const cv::Mat imgRight = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgRight.empty());
const int ndisp = 128;
if (PERF_RUN_GPU())
{
cv::gpu::StereoConstantSpaceBP d_csbp(ndisp);
cv::gpu::GpuMat d_imgLeft(imgLeft);
cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat d_dst;
d_csbp(d_imgLeft, d_imgRight, d_dst);
TEST_CYCLE()
{
d_csbp(d_imgLeft, d_imgRight, d_dst);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
FAIL() << "No such CPU implementation analogy.";
}
}
//////////////////////////////////////////////////////////////////////
// DisparityBilateralFilter
PERF_TEST_P(ImagePair, Calib3D_DisparityBilateralFilter, Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-disp.png")))
{
const cv::Mat img = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
const cv::Mat disp = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(disp.empty());
const int ndisp = 128;
if (PERF_RUN_GPU())
{
cv::gpu::DisparityBilateralFilter d_filter(ndisp);
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_disp(disp);
cv::gpu::GpuMat d_dst;
d_filter(d_disp, d_img, d_dst);
TEST_CYCLE()
{
d_filter(d_disp, d_img, d_dst);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
FAIL() << "No such CPU implementation analogy.";
}
}
//////////////////////////////////////////////////////////////////////
// TransformPoints
DEF_PARAM_TEST_1(Count, int);
PERF_TEST_P(Count, Calib3D_TransformPoints, Values(5000, 10000, 20000))
{
const int count = GetParam();
cv::Mat src(1, count, CV_32FC3);
fillRandom(src, -100, 100);
const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::transformPoints(d_src, rvec, tvec, d_dst);
TEST_CYCLE()
{
cv::gpu::transformPoints(d_src, rvec, tvec, d_dst);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
FAIL() << "No such CPU implementation analogy.";
}
}
//////////////////////////////////////////////////////////////////////
// ProjectPoints
PERF_TEST_P(Count, Calib3D_ProjectPoints, Values(5000, 10000, 20000))
{
const int count = GetParam();
cv::Mat src(1, count, CV_32FC3);
fillRandom(src, -100, 100);
const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
const cv::Mat camera_mat = cv::Mat::ones(3, 3, CV_32FC1);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), d_dst);
TEST_CYCLE()
{
cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), d_dst);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
TEST_CYCLE()
{
cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// SolvePnPRansac
PERF_TEST_P(Count, Calib3D_SolvePnPRansac, Values(5000, 10000, 20000))
{
declare.time(10.0);
const int count = GetParam();
cv::Mat object(1, count, CV_32FC3);
fillRandom(object, -100, 100);
cv::Mat camera_mat(3, 3, CV_32FC1);
fillRandom(camera_mat, 0.5, 1);
camera_mat.at<float>(0, 1) = 0.f;
camera_mat.at<float>(1, 0) = 0.f;
camera_mat.at<float>(2, 0) = 0.f;
camera_mat.at<float>(2, 1) = 0.f;
const cv::Mat dist_coef(1, 8, CV_32F, cv::Scalar::all(0));
std::vector<cv::Point2f> image_vec;
cv::Mat rvec_gold(1, 3, CV_32FC1);
fillRandom(rvec_gold, 0, 1);
cv::Mat tvec_gold(1, 3, CV_32FC1);
fillRandom(tvec_gold, 0, 1);
cv::projectPoints(object, rvec_gold, tvec_gold, camera_mat, dist_coef, image_vec);
cv::Mat image(1, count, CV_32FC2, &image_vec[0]);
cv::Mat rvec;
cv::Mat tvec;
if (PERF_RUN_GPU())
{
cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
TEST_CYCLE()
{
cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
}
}
else
{
cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
TEST_CYCLE()
{
cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
}
}
CPU_SANITY_CHECK(rvec);
CPU_SANITY_CHECK(tvec);
}
//////////////////////////////////////////////////////////////////////
// ReprojectImageTo3D
PERF_TEST_P(Sz_Depth, Calib3D_ReprojectImageTo3D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16S)))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
cv::Mat src(size, depth);
fillRandom(src, 5.0, 30.0);
cv::Mat Q(4, 4, CV_32FC1);
fillRandom(Q, 0.1, 1.0);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::reprojectImageTo3D(d_src, d_dst, Q);
TEST_CYCLE()
{
cv::gpu::reprojectImageTo3D(d_src, d_dst, Q);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::reprojectImageTo3D(src, dst, Q);
TEST_CYCLE()
{
cv::reprojectImageTo3D(src, dst, Q);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// DrawColorDisp
PERF_TEST_P(Sz_Depth, Calib3D_DrawColorDisp, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16S)))
{
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
cv::Mat src(size, type);
fillRandom(src, 0, 255);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::drawColorDisp(d_src, d_dst, 255);
TEST_CYCLE()
{
cv::gpu::drawColorDisp(d_src, d_dst, 255);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
FAIL() << "No such CPU implementation analogy.";
}
}
} // namespace
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
namespace {
//////////////////////////////////////////////////////////////////////
// StereoBM
typedef std::tr1::tuple<string, string> pair_string;
DEF_PARAM_TEST_1(ImagePair, pair_string);
PERF_TEST_P(ImagePair, Calib3D_StereoBM, Values(pair_string("gpu/perf/aloe.png", "gpu/perf/aloeR.png")))
{
declare.time(5.0);
const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgLeft.empty());
const cv::Mat imgRight = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgRight.empty());
const int preset = 0;
const int ndisp = 256;
if (PERF_RUN_GPU())
{
cv::gpu::StereoBM_GPU d_bm(preset, ndisp);
cv::gpu::GpuMat d_imgLeft(imgLeft);
cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat d_dst;
d_bm(d_imgLeft, d_imgRight, d_dst);
TEST_CYCLE()
{
d_bm(d_imgLeft, d_imgRight, d_dst);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::StereoBM bm(preset, ndisp);
cv::Mat dst;
bm(imgLeft, imgRight, dst);
TEST_CYCLE()
{
bm(imgLeft, imgRight, dst);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// StereoBeliefPropagation
PERF_TEST_P(ImagePair, Calib3D_StereoBeliefPropagation, Values(pair_string("gpu/stereobp/aloe-L.png", "gpu/stereobp/aloe-R.png")))
{
declare.time(10.0);
const cv::Mat imgLeft = readImage(GET_PARAM(0));
ASSERT_FALSE(imgLeft.empty());
const cv::Mat imgRight = readImage(GET_PARAM(1));
ASSERT_FALSE(imgRight.empty());
const int ndisp = 64;
if (PERF_RUN_GPU())
{
cv::gpu::StereoBeliefPropagation d_bp(ndisp);
cv::gpu::GpuMat d_imgLeft(imgLeft);
cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat d_dst;
d_bp(d_imgLeft, d_imgRight, d_dst);
TEST_CYCLE()
{
d_bp(d_imgLeft, d_imgRight, d_dst);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
FAIL() << "No such CPU implementation analogy.";
}
}
//////////////////////////////////////////////////////////////////////
// StereoConstantSpaceBP
PERF_TEST_P(ImagePair, Calib3D_StereoConstantSpaceBP, Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-R.png")))
{
declare.time(10.0);
const cv::Mat imgLeft = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgLeft.empty());
const cv::Mat imgRight = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(imgRight.empty());
const int ndisp = 128;
if (PERF_RUN_GPU())
{
cv::gpu::StereoConstantSpaceBP d_csbp(ndisp);
cv::gpu::GpuMat d_imgLeft(imgLeft);
cv::gpu::GpuMat d_imgRight(imgRight);
cv::gpu::GpuMat d_dst;
d_csbp(d_imgLeft, d_imgRight, d_dst);
TEST_CYCLE()
{
d_csbp(d_imgLeft, d_imgRight, d_dst);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
FAIL() << "No such CPU implementation analogy.";
}
}
//////////////////////////////////////////////////////////////////////
// DisparityBilateralFilter
PERF_TEST_P(ImagePair, Calib3D_DisparityBilateralFilter, Values(pair_string("gpu/stereobm/aloe-L.png", "gpu/stereobm/aloe-disp.png")))
{
const cv::Mat img = readImage(GET_PARAM(0), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
const cv::Mat disp = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(disp.empty());
const int ndisp = 128;
if (PERF_RUN_GPU())
{
cv::gpu::DisparityBilateralFilter d_filter(ndisp);
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_disp(disp);
cv::gpu::GpuMat d_dst;
d_filter(d_disp, d_img, d_dst);
TEST_CYCLE()
{
d_filter(d_disp, d_img, d_dst);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
FAIL() << "No such CPU implementation analogy.";
}
}
//////////////////////////////////////////////////////////////////////
// TransformPoints
DEF_PARAM_TEST_1(Count, int);
PERF_TEST_P(Count, Calib3D_TransformPoints, Values(5000, 10000, 20000))
{
const int count = GetParam();
cv::Mat src(1, count, CV_32FC3);
fillRandom(src, -100, 100);
const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::transformPoints(d_src, rvec, tvec, d_dst);
TEST_CYCLE()
{
cv::gpu::transformPoints(d_src, rvec, tvec, d_dst);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
FAIL() << "No such CPU implementation analogy.";
}
}
//////////////////////////////////////////////////////////////////////
// ProjectPoints
PERF_TEST_P(Count, Calib3D_ProjectPoints, Values(5000, 10000, 20000))
{
const int count = GetParam();
cv::Mat src(1, count, CV_32FC3);
fillRandom(src, -100, 100);
const cv::Mat rvec = cv::Mat::ones(1, 3, CV_32FC1);
const cv::Mat tvec = cv::Mat::ones(1, 3, CV_32FC1);
const cv::Mat camera_mat = cv::Mat::ones(3, 3, CV_32FC1);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), d_dst);
TEST_CYCLE()
{
cv::gpu::projectPoints(d_src, rvec, tvec, camera_mat, cv::Mat(), d_dst);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
TEST_CYCLE()
{
cv::projectPoints(src, rvec, tvec, camera_mat, cv::noArray(), dst);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// SolvePnPRansac
PERF_TEST_P(Count, Calib3D_SolvePnPRansac, Values(5000, 10000, 20000))
{
declare.time(10.0);
const int count = GetParam();
cv::Mat object(1, count, CV_32FC3);
fillRandom(object, -100, 100);
cv::Mat camera_mat(3, 3, CV_32FC1);
fillRandom(camera_mat, 0.5, 1);
camera_mat.at<float>(0, 1) = 0.f;
camera_mat.at<float>(1, 0) = 0.f;
camera_mat.at<float>(2, 0) = 0.f;
camera_mat.at<float>(2, 1) = 0.f;
const cv::Mat dist_coef(1, 8, CV_32F, cv::Scalar::all(0));
std::vector<cv::Point2f> image_vec;
cv::Mat rvec_gold(1, 3, CV_32FC1);
fillRandom(rvec_gold, 0, 1);
cv::Mat tvec_gold(1, 3, CV_32FC1);
fillRandom(tvec_gold, 0, 1);
cv::projectPoints(object, rvec_gold, tvec_gold, camera_mat, dist_coef, image_vec);
cv::Mat image(1, count, CV_32FC2, &image_vec[0]);
cv::Mat rvec;
cv::Mat tvec;
if (PERF_RUN_GPU())
{
cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
TEST_CYCLE()
{
cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
}
}
else
{
cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
TEST_CYCLE()
{
cv::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec);
}
}
CPU_SANITY_CHECK(rvec);
CPU_SANITY_CHECK(tvec);
}
//////////////////////////////////////////////////////////////////////
// ReprojectImageTo3D
PERF_TEST_P(Sz_Depth, Calib3D_ReprojectImageTo3D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16S)))
{
const cv::Size size = GET_PARAM(0);
const int depth = GET_PARAM(1);
cv::Mat src(size, depth);
fillRandom(src, 5.0, 30.0);
cv::Mat Q(4, 4, CV_32FC1);
fillRandom(Q, 0.1, 1.0);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::reprojectImageTo3D(d_src, d_dst, Q);
TEST_CYCLE()
{
cv::gpu::reprojectImageTo3D(d_src, d_dst, Q);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::reprojectImageTo3D(src, dst, Q);
TEST_CYCLE()
{
cv::reprojectImageTo3D(src, dst, Q);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// DrawColorDisp
PERF_TEST_P(Sz_Depth, Calib3D_DrawColorDisp, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16S)))
{
const cv::Size size = GET_PARAM(0);
const int type = GET_PARAM(1);
cv::Mat src(size, type);
fillRandom(src, 0, 255);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::drawColorDisp(d_src, d_dst, 255);
TEST_CYCLE()
{
cv::gpu::drawColorDisp(d_src, d_dst, 255);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
FAIL() << "No such CPU implementation analogy.";
}
}
} // namespace

File diff suppressed because it is too large Load Diff

View File

@@ -1,309 +1,309 @@
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
namespace {
//////////////////////////////////////////////////////////////////////
// SURF
DEF_PARAM_TEST_1(Image, string);
PERF_TEST_P(Image, Features2D_SURF, Values<string>("gpu/perf/aloe.png"))
{
declare.time(50.0);
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
cv::gpu::SURF_GPU d_surf;
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_keypoints, d_descriptors;
d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
TEST_CYCLE()
{
d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
}
GPU_SANITY_CHECK(d_descriptors, 1e-4);
GPU_SANITY_CHECK_KEYPOINTS(SURF, d_keypoints);
}
else
{
cv::SURF surf;
std::vector<cv::KeyPoint> keypoints;
cv::Mat descriptors;
surf(img, cv::noArray(), keypoints, descriptors);
TEST_CYCLE()
{
keypoints.clear();
surf(img, cv::noArray(), keypoints, descriptors);
}
SANITY_CHECK_KEYPOINTS(keypoints);
SANITY_CHECK(descriptors, 1e-4);
}
}
//////////////////////////////////////////////////////////////////////
// FAST
PERF_TEST_P(Image, Features2D_FAST, Values<string>("gpu/perf/aloe.png"))
{
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
cv::gpu::FAST_GPU d_fast(20);
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_keypoints;
d_fast(d_img, cv::gpu::GpuMat(), d_keypoints);
TEST_CYCLE()
{
d_fast(d_img, cv::gpu::GpuMat(), d_keypoints);
}
GPU_SANITY_CHECK_RESPONSE(FAST, d_keypoints);
}
else
{
std::vector<cv::KeyPoint> keypoints;
cv::FAST(img, keypoints, 20);
TEST_CYCLE()
{
keypoints.clear();
cv::FAST(img, keypoints, 20);
}
SANITY_CHECK_KEYPOINTS(keypoints);
}
}
//////////////////////////////////////////////////////////////////////
// ORB
PERF_TEST_P(Image, Features2D_ORB, Values<string>("gpu/perf/aloe.png"))
{
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
cv::gpu::ORB_GPU d_orb(4000);
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_keypoints, d_descriptors;
d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
TEST_CYCLE()
{
d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
}
GPU_SANITY_CHECK_KEYPOINTS(ORB, d_keypoints);
GPU_SANITY_CHECK(d_descriptors);
}
else
{
cv::ORB orb(4000);
std::vector<cv::KeyPoint> keypoints;
cv::Mat descriptors;
orb(img, cv::noArray(), keypoints, descriptors);
TEST_CYCLE()
{
keypoints.clear();
orb(img, cv::noArray(), keypoints, descriptors);
}
SANITY_CHECK_KEYPOINTS(keypoints);
SANITY_CHECK(descriptors);
}
}
//////////////////////////////////////////////////////////////////////
// BFMatch
DEF_PARAM_TEST(DescSize_Norm, int, NormType);
PERF_TEST_P(DescSize_Norm, Features2D_BFMatch, Combine(Values(64, 128, 256), Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
{
declare.time(20.0);
int desc_size = GET_PARAM(0);
int normType = GET_PARAM(1);
int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query(3000, desc_size, type);
fillRandom(query);
cv::Mat train(3000, desc_size, type);
fillRandom(train);
if (PERF_RUN_GPU())
{
cv::gpu::BFMatcher_GPU d_matcher(normType);
cv::gpu::GpuMat d_query(query);
cv::gpu::GpuMat d_train(train);
cv::gpu::GpuMat d_trainIdx, d_distance;
d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
TEST_CYCLE()
{
d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
}
GPU_SANITY_CHECK(d_trainIdx);
GPU_SANITY_CHECK(d_distance);
}
else
{
cv::BFMatcher matcher(normType);
std::vector<cv::DMatch> matches;
matcher.match(query, train, matches);
TEST_CYCLE()
{
matcher.match(query, train, matches);
}
SANITY_CHECK(matches);
}
}
//////////////////////////////////////////////////////////////////////
// BFKnnMatch
DEF_PARAM_TEST(DescSize_K_Norm, int, int, NormType);
PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch, Combine(
Values(64, 128, 256),
Values(2, 3),
Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
{
declare.time(30.0);
int desc_size = GET_PARAM(0);
int k = GET_PARAM(1);
int normType = GET_PARAM(2);
int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query(3000, desc_size, type);
fillRandom(query);
cv::Mat train(3000, desc_size, type);
fillRandom(train);
if (PERF_RUN_GPU())
{
cv::gpu::BFMatcher_GPU d_matcher(normType);
cv::gpu::GpuMat d_query(query);
cv::gpu::GpuMat d_train(train);
cv::gpu::GpuMat d_trainIdx, d_distance, d_allDist;
d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
TEST_CYCLE()
{
d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
}
GPU_SANITY_CHECK(d_trainIdx);
GPU_SANITY_CHECK(d_distance);
}
else
{
cv::BFMatcher matcher(normType);
std::vector< std::vector<cv::DMatch> > matches;
matcher.knnMatch(query, train, matches, k);
TEST_CYCLE()
{
matcher.knnMatch(query, train, matches, k);
}
SANITY_CHECK(matches);
}
}
//////////////////////////////////////////////////////////////////////
// BFRadiusMatch
PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch, Combine(Values(64, 128, 256), Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
{
declare.time(30.0);
int desc_size = GET_PARAM(0);
int normType = GET_PARAM(1);
int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query(3000, desc_size, type);
fillRandom(query, 0.0, 1.0);
cv::Mat train(3000, desc_size, type);
fillRandom(train, 0.0, 1.0);
if (PERF_RUN_GPU())
{
cv::gpu::BFMatcher_GPU d_matcher(normType);
cv::gpu::GpuMat d_query(query);
cv::gpu::GpuMat d_train(train);
cv::gpu::GpuMat d_trainIdx, d_nMatches, d_distance;
d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, 2.0);
TEST_CYCLE()
{
d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, 2.0);
}
GPU_SANITY_CHECK(d_trainIdx);
GPU_SANITY_CHECK(d_distance);
}
else
{
cv::BFMatcher matcher(normType);
std::vector< std::vector<cv::DMatch> > matches;
matcher.radiusMatch(query, train, matches, 2.0);
TEST_CYCLE()
{
matcher.radiusMatch(query, train, matches, 2.0);
}
SANITY_CHECK(matches);
}
}
} // namespace
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
namespace {
//////////////////////////////////////////////////////////////////////
// SURF
DEF_PARAM_TEST_1(Image, string);
PERF_TEST_P(Image, Features2D_SURF, Values<string>("gpu/perf/aloe.png"))
{
declare.time(50.0);
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
cv::gpu::SURF_GPU d_surf;
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_keypoints, d_descriptors;
d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
TEST_CYCLE()
{
d_surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
}
GPU_SANITY_CHECK(d_descriptors, 1e-4);
GPU_SANITY_CHECK_KEYPOINTS(SURF, d_keypoints);
}
else
{
cv::SURF surf;
std::vector<cv::KeyPoint> keypoints;
cv::Mat descriptors;
surf(img, cv::noArray(), keypoints, descriptors);
TEST_CYCLE()
{
keypoints.clear();
surf(img, cv::noArray(), keypoints, descriptors);
}
SANITY_CHECK_KEYPOINTS(keypoints);
SANITY_CHECK(descriptors, 1e-4);
}
}
//////////////////////////////////////////////////////////////////////
// FAST
PERF_TEST_P(Image, Features2D_FAST, Values<string>("gpu/perf/aloe.png"))
{
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
cv::gpu::FAST_GPU d_fast(20);
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_keypoints;
d_fast(d_img, cv::gpu::GpuMat(), d_keypoints);
TEST_CYCLE()
{
d_fast(d_img, cv::gpu::GpuMat(), d_keypoints);
}
GPU_SANITY_CHECK_RESPONSE(FAST, d_keypoints);
}
else
{
std::vector<cv::KeyPoint> keypoints;
cv::FAST(img, keypoints, 20);
TEST_CYCLE()
{
keypoints.clear();
cv::FAST(img, keypoints, 20);
}
SANITY_CHECK_KEYPOINTS(keypoints);
}
}
//////////////////////////////////////////////////////////////////////
// ORB
PERF_TEST_P(Image, Features2D_ORB, Values<string>("gpu/perf/aloe.png"))
{
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
cv::gpu::ORB_GPU d_orb(4000);
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_keypoints, d_descriptors;
d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
TEST_CYCLE()
{
d_orb(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);
}
GPU_SANITY_CHECK_KEYPOINTS(ORB, d_keypoints);
GPU_SANITY_CHECK(d_descriptors);
}
else
{
cv::ORB orb(4000);
std::vector<cv::KeyPoint> keypoints;
cv::Mat descriptors;
orb(img, cv::noArray(), keypoints, descriptors);
TEST_CYCLE()
{
keypoints.clear();
orb(img, cv::noArray(), keypoints, descriptors);
}
SANITY_CHECK_KEYPOINTS(keypoints);
SANITY_CHECK(descriptors);
}
}
//////////////////////////////////////////////////////////////////////
// BFMatch
DEF_PARAM_TEST(DescSize_Norm, int, NormType);
PERF_TEST_P(DescSize_Norm, Features2D_BFMatch, Combine(Values(64, 128, 256), Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
{
declare.time(20.0);
int desc_size = GET_PARAM(0);
int normType = GET_PARAM(1);
int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query(3000, desc_size, type);
fillRandom(query);
cv::Mat train(3000, desc_size, type);
fillRandom(train);
if (PERF_RUN_GPU())
{
cv::gpu::BFMatcher_GPU d_matcher(normType);
cv::gpu::GpuMat d_query(query);
cv::gpu::GpuMat d_train(train);
cv::gpu::GpuMat d_trainIdx, d_distance;
d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
TEST_CYCLE()
{
d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
}
GPU_SANITY_CHECK(d_trainIdx);
GPU_SANITY_CHECK(d_distance);
}
else
{
cv::BFMatcher matcher(normType);
std::vector<cv::DMatch> matches;
matcher.match(query, train, matches);
TEST_CYCLE()
{
matcher.match(query, train, matches);
}
SANITY_CHECK(matches);
}
}
//////////////////////////////////////////////////////////////////////
// BFKnnMatch
DEF_PARAM_TEST(DescSize_K_Norm, int, int, NormType);
PERF_TEST_P(DescSize_K_Norm, Features2D_BFKnnMatch, Combine(
Values(64, 128, 256),
Values(2, 3),
Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
{
declare.time(30.0);
int desc_size = GET_PARAM(0);
int k = GET_PARAM(1);
int normType = GET_PARAM(2);
int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query(3000, desc_size, type);
fillRandom(query);
cv::Mat train(3000, desc_size, type);
fillRandom(train);
if (PERF_RUN_GPU())
{
cv::gpu::BFMatcher_GPU d_matcher(normType);
cv::gpu::GpuMat d_query(query);
cv::gpu::GpuMat d_train(train);
cv::gpu::GpuMat d_trainIdx, d_distance, d_allDist;
d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
TEST_CYCLE()
{
d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, k);
}
GPU_SANITY_CHECK(d_trainIdx);
GPU_SANITY_CHECK(d_distance);
}
else
{
cv::BFMatcher matcher(normType);
std::vector< std::vector<cv::DMatch> > matches;
matcher.knnMatch(query, train, matches, k);
TEST_CYCLE()
{
matcher.knnMatch(query, train, matches, k);
}
SANITY_CHECK(matches);
}
}
//////////////////////////////////////////////////////////////////////
// BFRadiusMatch
PERF_TEST_P(DescSize_Norm, Features2D_BFRadiusMatch, Combine(Values(64, 128, 256), Values(NormType(cv::NORM_L1), NormType(cv::NORM_L2), NormType(cv::NORM_HAMMING))))
{
declare.time(30.0);
int desc_size = GET_PARAM(0);
int normType = GET_PARAM(1);
int type = normType == cv::NORM_HAMMING ? CV_8U : CV_32F;
cv::Mat query(3000, desc_size, type);
fillRandom(query, 0.0, 1.0);
cv::Mat train(3000, desc_size, type);
fillRandom(train, 0.0, 1.0);
if (PERF_RUN_GPU())
{
cv::gpu::BFMatcher_GPU d_matcher(normType);
cv::gpu::GpuMat d_query(query);
cv::gpu::GpuMat d_train(train);
cv::gpu::GpuMat d_trainIdx, d_nMatches, d_distance;
d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, 2.0);
TEST_CYCLE()
{
d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, 2.0);
}
GPU_SANITY_CHECK(d_trainIdx);
GPU_SANITY_CHECK(d_distance);
}
else
{
cv::BFMatcher matcher(normType);
std::vector< std::vector<cv::DMatch> > matches;
matcher.radiusMatch(query, train, matches, 2.0);
TEST_CYCLE()
{
matcher.radiusMatch(query, train, matches, 2.0);
}
SANITY_CHECK(matches);
}
}
} // namespace

View File

@@ -1,415 +1,415 @@
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
namespace {
//////////////////////////////////////////////////////////////////////
// Blur
DEF_PARAM_TEST(Sz_Type_KernelSz, cv::Size, MatType, int);
PERF_TEST_P(Sz_Type_KernelSz, Filters_Blur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4), Values(3, 5, 7)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::blur(d_src, d_dst, cv::Size(ksize, ksize));
TEST_CYCLE()
{
cv::gpu::blur(d_src, d_dst, cv::Size(ksize, ksize));
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::blur(src, dst, cv::Size(ksize, ksize));
TEST_CYCLE()
{
cv::blur(src, dst, cv::Size(ksize, ksize));
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Sobel
PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf;
cv::gpu::Sobel(d_src, d_dst, -1, 1, 1, d_buf, ksize);
TEST_CYCLE()
{
cv::gpu::Sobel(d_src, d_dst, -1, 1, 1, d_buf, ksize);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::Sobel(src, dst, -1, 1, 1, ksize);
TEST_CYCLE()
{
cv::Sobel(src, dst, -1, 1, 1, ksize);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Scharr
PERF_TEST_P(Sz_Type, Filters_Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
cv::Mat src(size, type);
fillRandom(src);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf;
cv::gpu::Scharr(d_src, d_dst, -1, 1, 0, d_buf);
TEST_CYCLE()
{
cv::gpu::Scharr(d_src, d_dst, -1, 1, 0, d_buf);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::Scharr(src, dst, -1, 1, 0);
TEST_CYCLE()
{
cv::Scharr(src, dst, -1, 1, 0);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// GaussianBlur
PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf;
cv::gpu::GaussianBlur(d_src, d_dst, cv::Size(ksize, ksize), d_buf, 0.5);
TEST_CYCLE()
{
cv::gpu::GaussianBlur(d_src, d_dst, cv::Size(ksize, ksize), d_buf, 0.5);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
TEST_CYCLE()
{
cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Laplacian
PERF_TEST_P(Sz_Type_KernelSz, Filters_Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 3)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::Laplacian(d_src, d_dst, -1, ksize);
TEST_CYCLE()
{
cv::gpu::Laplacian(d_src, d_dst, -1, ksize);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::Laplacian(src, dst, -1, ksize);
TEST_CYCLE()
{
cv::Laplacian(src, dst, -1, ksize);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Erode
PERF_TEST_P(Sz_Type, Filters_Erode, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf;
cv::gpu::erode(d_src, d_dst, ker, d_buf);
TEST_CYCLE()
{
cv::gpu::erode(d_src, d_dst, ker, d_buf);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::erode(src, dst, ker);
TEST_CYCLE()
{
cv::erode(src, dst, ker);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Dilate
PERF_TEST_P(Sz_Type, Filters_Dilate, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf;
cv::gpu::dilate(d_src, d_dst, ker, d_buf);
TEST_CYCLE()
{
cv::gpu::dilate(d_src, d_dst, ker, d_buf);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::dilate(src, dst, ker);
TEST_CYCLE()
{
cv::dilate(src, dst, ker);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// MorphologyEx
CV_ENUM(MorphOp, cv::MORPH_OPEN, cv::MORPH_CLOSE, cv::MORPH_GRADIENT, cv::MORPH_TOPHAT, cv::MORPH_BLACKHAT)
#define ALL_MORPH_OPS ValuesIn(MorphOp::all())
DEF_PARAM_TEST(Sz_Type_Op, cv::Size, MatType, MorphOp);
PERF_TEST_P(Sz_Type_Op, Filters_MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4), ALL_MORPH_OPS))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int morphOp = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf1;
cv::gpu::GpuMat d_buf2;
cv::gpu::morphologyEx(d_src, d_dst, morphOp, ker, d_buf1, d_buf2);
TEST_CYCLE()
{
cv::gpu::morphologyEx(d_src, d_dst, morphOp, ker, d_buf1, d_buf2);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::morphologyEx(src, dst, morphOp, ker);
TEST_CYCLE()
{
cv::morphologyEx(src, dst, morphOp, ker);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Filter2D
PERF_TEST_P(Sz_Type_KernelSz, Filters_Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat kernel(ksize, ksize, CV_32FC1);
fillRandom(kernel, 0.0, 1.0);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::filter2D(d_src, d_dst, -1, kernel);
TEST_CYCLE()
{
cv::gpu::filter2D(d_src, d_dst, -1, kernel);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::filter2D(src, dst, -1, kernel);
TEST_CYCLE()
{
cv::filter2D(src, dst, -1, kernel);
}
CPU_SANITY_CHECK(dst);
}
}
} // namespace
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
namespace {
//////////////////////////////////////////////////////////////////////
// Blur
DEF_PARAM_TEST(Sz_Type_KernelSz, cv::Size, MatType, int);
PERF_TEST_P(Sz_Type_KernelSz, Filters_Blur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4), Values(3, 5, 7)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::blur(d_src, d_dst, cv::Size(ksize, ksize));
TEST_CYCLE()
{
cv::gpu::blur(d_src, d_dst, cv::Size(ksize, ksize));
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::blur(src, dst, cv::Size(ksize, ksize));
TEST_CYCLE()
{
cv::blur(src, dst, cv::Size(ksize, ksize));
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Sobel
PERF_TEST_P(Sz_Type_KernelSz, Filters_Sobel, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf;
cv::gpu::Sobel(d_src, d_dst, -1, 1, 1, d_buf, ksize);
TEST_CYCLE()
{
cv::gpu::Sobel(d_src, d_dst, -1, 1, 1, d_buf, ksize);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::Sobel(src, dst, -1, 1, 1, ksize);
TEST_CYCLE()
{
cv::Sobel(src, dst, -1, 1, 1, ksize);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Scharr
PERF_TEST_P(Sz_Type, Filters_Scharr, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
cv::Mat src(size, type);
fillRandom(src);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf;
cv::gpu::Scharr(d_src, d_dst, -1, 1, 0, d_buf);
TEST_CYCLE()
{
cv::gpu::Scharr(d_src, d_dst, -1, 1, 0, d_buf);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::Scharr(src, dst, -1, 1, 0);
TEST_CYCLE()
{
cv::Scharr(src, dst, -1, 1, 0);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// GaussianBlur
PERF_TEST_P(Sz_Type_KernelSz, Filters_GaussianBlur, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1), Values(3, 5, 7, 9, 11, 13, 15)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf;
cv::gpu::GaussianBlur(d_src, d_dst, cv::Size(ksize, ksize), d_buf, 0.5);
TEST_CYCLE()
{
cv::gpu::GaussianBlur(d_src, d_dst, cv::Size(ksize, ksize), d_buf, 0.5);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
TEST_CYCLE()
{
cv::GaussianBlur(src, dst, cv::Size(ksize, ksize), 0.5);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Laplacian
PERF_TEST_P(Sz_Type_KernelSz, Filters_Laplacian, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 3)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::Laplacian(d_src, d_dst, -1, ksize);
TEST_CYCLE()
{
cv::gpu::Laplacian(d_src, d_dst, -1, ksize);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::Laplacian(src, dst, -1, ksize);
TEST_CYCLE()
{
cv::Laplacian(src, dst, -1, ksize);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Erode
PERF_TEST_P(Sz_Type, Filters_Erode, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf;
cv::gpu::erode(d_src, d_dst, ker, d_buf);
TEST_CYCLE()
{
cv::gpu::erode(d_src, d_dst, ker, d_buf);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::erode(src, dst, ker);
TEST_CYCLE()
{
cv::erode(src, dst, ker);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Dilate
PERF_TEST_P(Sz_Type, Filters_Dilate, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf;
cv::gpu::dilate(d_src, d_dst, ker, d_buf);
TEST_CYCLE()
{
cv::gpu::dilate(d_src, d_dst, ker, d_buf);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::dilate(src, dst, ker);
TEST_CYCLE()
{
cv::dilate(src, dst, ker);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// MorphologyEx
CV_ENUM(MorphOp, cv::MORPH_OPEN, cv::MORPH_CLOSE, cv::MORPH_GRADIENT, cv::MORPH_TOPHAT, cv::MORPH_BLACKHAT)
#define ALL_MORPH_OPS ValuesIn(MorphOp::all())
DEF_PARAM_TEST(Sz_Type_Op, cv::Size, MatType, MorphOp);
PERF_TEST_P(Sz_Type_Op, Filters_MorphologyEx, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4), ALL_MORPH_OPS))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int morphOp = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat ker = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(3, 3));
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::GpuMat d_buf1;
cv::gpu::GpuMat d_buf2;
cv::gpu::morphologyEx(d_src, d_dst, morphOp, ker, d_buf1, d_buf2);
TEST_CYCLE()
{
cv::gpu::morphologyEx(d_src, d_dst, morphOp, ker, d_buf1, d_buf2);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::morphologyEx(src, dst, morphOp, ker);
TEST_CYCLE()
{
cv::morphologyEx(src, dst, morphOp, ker);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// Filter2D
PERF_TEST_P(Sz_Type_KernelSz, Filters_Filter2D, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(3, 5, 7, 9, 11, 13, 15)))
{
declare.time(20.0);
cv::Size size = GET_PARAM(0);
int type = GET_PARAM(1);
int ksize = GET_PARAM(2);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat kernel(ksize, ksize, CV_32FC1);
fillRandom(kernel, 0.0, 1.0);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
cv::gpu::filter2D(d_src, d_dst, -1, kernel);
TEST_CYCLE()
{
cv::gpu::filter2D(d_src, d_dst, -1, kernel);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
cv::filter2D(src, dst, -1, kernel);
TEST_CYCLE()
{
cv::filter2D(src, dst, -1, kernel);
}
CPU_SANITY_CHECK(dst);
}
}
} // namespace

File diff suppressed because it is too large Load Diff

View File

@@ -32,8 +32,8 @@ struct GreedyLabeling
return lo <= d && d <= hi;
}
private:
InInterval& operator=(const InInterval&);
};

View File

@@ -1,185 +1,185 @@
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
namespace {
//////////////////////////////////////////////////////////////////////
// SetTo
PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), GPU_CHANNELS_1_3_4))
{
cv::Size size = GET_PARAM(0);
int depth = GET_PARAM(1);
int channels = GET_PARAM(2);
int type = CV_MAKE_TYPE(depth, channels);
cv::Scalar val(1, 2, 3, 4);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(size, type);
d_src.setTo(val);
TEST_CYCLE()
{
d_src.setTo(val);
}
GPU_SANITY_CHECK(d_src);
}
else
{
cv::Mat src(size, type);
src.setTo(val);
TEST_CYCLE()
{
src.setTo(val);
}
CPU_SANITY_CHECK(src);
}
}
//////////////////////////////////////////////////////////////////////
// SetToMasked
PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), GPU_CHANNELS_1_3_4))
{
cv::Size size = GET_PARAM(0);
int depth = GET_PARAM(1);
int channels = GET_PARAM(2);
int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat mask(size, CV_8UC1);
fillRandom(mask, 0, 2);
cv::Scalar val(1, 2, 3, 4);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_mask(mask);
d_src.setTo(val, d_mask);
TEST_CYCLE()
{
d_src.setTo(val, d_mask);
}
GPU_SANITY_CHECK(d_src);
}
else
{
src.setTo(val, mask);
TEST_CYCLE()
{
src.setTo(val, mask);
}
CPU_SANITY_CHECK(src);
}
}
//////////////////////////////////////////////////////////////////////
// CopyToMasked
PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), GPU_CHANNELS_1_3_4))
{
cv::Size size = GET_PARAM(0);
int depth = GET_PARAM(1);
int channels = GET_PARAM(2);
int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat mask(size, CV_8UC1);
fillRandom(mask, 0, 2);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_mask(mask);
cv::gpu::GpuMat d_dst;
d_src.copyTo(d_dst, d_mask);
TEST_CYCLE()
{
d_src.copyTo(d_dst, d_mask);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
src.copyTo(dst, mask);
TEST_CYCLE()
{
src.copyTo(dst, mask);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// ConvertTo
DEF_PARAM_TEST(Sz_2Depth, cv::Size, MatDepth, MatDepth);
PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(CV_8U, CV_16U, CV_32F, CV_64F)))
{
cv::Size size = GET_PARAM(0);
int depth1 = GET_PARAM(1);
int depth2 = GET_PARAM(2);
cv::Mat src(size, depth1);
fillRandom(src);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
d_src.convertTo(d_dst, depth2, 0.5, 1.0);
TEST_CYCLE()
{
d_src.convertTo(d_dst, depth2, 0.5, 1.0);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
src.convertTo(dst, depth2, 0.5, 1.0);
TEST_CYCLE()
{
src.convertTo(dst, depth2, 0.5, 1.0);
}
CPU_SANITY_CHECK(dst);
}
}
} // namespace
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
namespace {
//////////////////////////////////////////////////////////////////////
// SetTo
PERF_TEST_P(Sz_Depth_Cn, MatOp_SetTo, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), GPU_CHANNELS_1_3_4))
{
cv::Size size = GET_PARAM(0);
int depth = GET_PARAM(1);
int channels = GET_PARAM(2);
int type = CV_MAKE_TYPE(depth, channels);
cv::Scalar val(1, 2, 3, 4);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(size, type);
d_src.setTo(val);
TEST_CYCLE()
{
d_src.setTo(val);
}
GPU_SANITY_CHECK(d_src);
}
else
{
cv::Mat src(size, type);
src.setTo(val);
TEST_CYCLE()
{
src.setTo(val);
}
CPU_SANITY_CHECK(src);
}
}
//////////////////////////////////////////////////////////////////////
// SetToMasked
PERF_TEST_P(Sz_Depth_Cn, MatOp_SetToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), GPU_CHANNELS_1_3_4))
{
cv::Size size = GET_PARAM(0);
int depth = GET_PARAM(1);
int channels = GET_PARAM(2);
int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat mask(size, CV_8UC1);
fillRandom(mask, 0, 2);
cv::Scalar val(1, 2, 3, 4);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_mask(mask);
d_src.setTo(val, d_mask);
TEST_CYCLE()
{
d_src.setTo(val, d_mask);
}
GPU_SANITY_CHECK(d_src);
}
else
{
src.setTo(val, mask);
TEST_CYCLE()
{
src.setTo(val, mask);
}
CPU_SANITY_CHECK(src);
}
}
//////////////////////////////////////////////////////////////////////
// CopyToMasked
PERF_TEST_P(Sz_Depth_Cn, MatOp_CopyToMasked, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), GPU_CHANNELS_1_3_4))
{
cv::Size size = GET_PARAM(0);
int depth = GET_PARAM(1);
int channels = GET_PARAM(2);
int type = CV_MAKE_TYPE(depth, channels);
cv::Mat src(size, type);
fillRandom(src);
cv::Mat mask(size, CV_8UC1);
fillRandom(mask, 0, 2);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_mask(mask);
cv::gpu::GpuMat d_dst;
d_src.copyTo(d_dst, d_mask);
TEST_CYCLE()
{
d_src.copyTo(d_dst, d_mask);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
src.copyTo(dst, mask);
TEST_CYCLE()
{
src.copyTo(dst, mask);
}
CPU_SANITY_CHECK(dst);
}
}
//////////////////////////////////////////////////////////////////////
// ConvertTo
DEF_PARAM_TEST(Sz_2Depth, cv::Size, MatDepth, MatDepth);
PERF_TEST_P(Sz_2Depth, MatOp_ConvertTo, Combine(GPU_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F, CV_64F), Values(CV_8U, CV_16U, CV_32F, CV_64F)))
{
cv::Size size = GET_PARAM(0);
int depth1 = GET_PARAM(1);
int depth2 = GET_PARAM(2);
cv::Mat src(size, depth1);
fillRandom(src);
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_src(src);
cv::gpu::GpuMat d_dst;
d_src.convertTo(d_dst, depth2, 0.5, 1.0);
TEST_CYCLE()
{
d_src.convertTo(d_dst, depth2, 0.5, 1.0);
}
GPU_SANITY_CHECK(d_dst);
}
else
{
cv::Mat dst;
src.convertTo(dst, depth2, 0.5, 1.0);
TEST_CYCLE()
{
src.convertTo(dst, depth2, 0.5, 1.0);
}
CPU_SANITY_CHECK(dst);
}
}
} // namespace

View File

@@ -1,184 +1,184 @@
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
namespace {
///////////////////////////////////////////////////////////////
// HOG
DEF_PARAM_TEST_1(Image, string);
PERF_TEST_P(Image, ObjDetect_HOG, Values<string>("gpu/hog/road.png"))
{
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
std::vector<cv::Rect> found_locations;
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_img(img);
cv::gpu::HOGDescriptor d_hog;
d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
d_hog.detectMultiScale(d_img, found_locations);
TEST_CYCLE()
{
d_hog.detectMultiScale(d_img, found_locations);
}
}
else
{
cv::HOGDescriptor hog;
hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
hog.detectMultiScale(img, found_locations);
TEST_CYCLE()
{
hog.detectMultiScale(img, found_locations);
}
}
SANITY_CHECK(found_locations);
}
//===========test for CalTech data =============//
DEF_PARAM_TEST_1(HOG, string);
PERF_TEST_P(HOG, CalTech, Values<string>("gpu/caltech/image_00000009_0.png", "gpu/caltech/image_00000032_0.png",
"gpu/caltech/image_00000165_0.png", "gpu/caltech/image_00000261_0.png", "gpu/caltech/image_00000469_0.png",
"gpu/caltech/image_00000527_0.png", "gpu/caltech/image_00000574_0.png"))
{
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
std::vector<cv::Rect> found_locations;
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_img(img);
cv::gpu::HOGDescriptor d_hog;
d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
d_hog.detectMultiScale(d_img, found_locations);
TEST_CYCLE()
{
d_hog.detectMultiScale(d_img, found_locations);
}
}
else
{
cv::HOGDescriptor hog;
hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
hog.detectMultiScale(img, found_locations);
TEST_CYCLE()
{
hog.detectMultiScale(img, found_locations);
}
}
SANITY_CHECK(found_locations);
}
///////////////////////////////////////////////////////////////
// HaarClassifier
typedef pair<string, string> pair_string;
DEF_PARAM_TEST_1(ImageAndCascade, pair_string);
PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/perf/haarcascade_frontalface_alt.xml")))
{
cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
cv::gpu::CascadeClassifier_GPU d_cascade;
ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_objects_buffer;
d_cascade.detectMultiScale(d_img, d_objects_buffer);
TEST_CYCLE()
{
d_cascade.detectMultiScale(d_img, d_objects_buffer);
}
GPU_SANITY_CHECK(d_objects_buffer);
}
else
{
cv::CascadeClassifier cascade;
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/perf/haarcascade_frontalface_alt.xml")));
std::vector<cv::Rect> rects;
cascade.detectMultiScale(img, rects);
TEST_CYCLE()
{
cascade.detectMultiScale(img, rects);
}
CPU_SANITY_CHECK(rects);
}
}
///////////////////////////////////////////////////////////////
// LBP cascade
PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/lbpcascade/lbpcascade_frontalface.xml")))
{
cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
cv::gpu::CascadeClassifier_GPU d_cascade;
ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_gpu_rects;
d_cascade.detectMultiScale(d_img, d_gpu_rects);
TEST_CYCLE()
{
d_cascade.detectMultiScale(d_img, d_gpu_rects);
}
GPU_SANITY_CHECK(d_gpu_rects);
}
else
{
cv::CascadeClassifier cascade;
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));
std::vector<cv::Rect> rects;
cascade.detectMultiScale(img, rects);
TEST_CYCLE()
{
cascade.detectMultiScale(img, rects);
}
CPU_SANITY_CHECK(rects);
}
}
} // namespace
#include "perf_precomp.hpp"
using namespace std;
using namespace testing;
namespace {
///////////////////////////////////////////////////////////////
// HOG
DEF_PARAM_TEST_1(Image, string);
PERF_TEST_P(Image, ObjDetect_HOG, Values<string>("gpu/hog/road.png"))
{
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
std::vector<cv::Rect> found_locations;
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_img(img);
cv::gpu::HOGDescriptor d_hog;
d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
d_hog.detectMultiScale(d_img, found_locations);
TEST_CYCLE()
{
d_hog.detectMultiScale(d_img, found_locations);
}
}
else
{
cv::HOGDescriptor hog;
hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
hog.detectMultiScale(img, found_locations);
TEST_CYCLE()
{
hog.detectMultiScale(img, found_locations);
}
}
SANITY_CHECK(found_locations);
}
//===========test for CalTech data =============//
DEF_PARAM_TEST_1(HOG, string);
PERF_TEST_P(HOG, CalTech, Values<string>("gpu/caltech/image_00000009_0.png", "gpu/caltech/image_00000032_0.png",
"gpu/caltech/image_00000165_0.png", "gpu/caltech/image_00000261_0.png", "gpu/caltech/image_00000469_0.png",
"gpu/caltech/image_00000527_0.png", "gpu/caltech/image_00000574_0.png"))
{
cv::Mat img = readImage(GetParam(), cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
std::vector<cv::Rect> found_locations;
if (PERF_RUN_GPU())
{
cv::gpu::GpuMat d_img(img);
cv::gpu::HOGDescriptor d_hog;
d_hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
d_hog.detectMultiScale(d_img, found_locations);
TEST_CYCLE()
{
d_hog.detectMultiScale(d_img, found_locations);
}
}
else
{
cv::HOGDescriptor hog;
hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
hog.detectMultiScale(img, found_locations);
TEST_CYCLE()
{
hog.detectMultiScale(img, found_locations);
}
}
SANITY_CHECK(found_locations);
}
///////////////////////////////////////////////////////////////
// HaarClassifier
typedef pair<string, string> pair_string;
DEF_PARAM_TEST_1(ImageAndCascade, pair_string);
PERF_TEST_P(ImageAndCascade, ObjDetect_HaarClassifier,
Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/perf/haarcascade_frontalface_alt.xml")))
{
cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
cv::gpu::CascadeClassifier_GPU d_cascade;
ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_objects_buffer;
d_cascade.detectMultiScale(d_img, d_objects_buffer);
TEST_CYCLE()
{
d_cascade.detectMultiScale(d_img, d_objects_buffer);
}
GPU_SANITY_CHECK(d_objects_buffer);
}
else
{
cv::CascadeClassifier cascade;
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/perf/haarcascade_frontalface_alt.xml")));
std::vector<cv::Rect> rects;
cascade.detectMultiScale(img, rects);
TEST_CYCLE()
{
cascade.detectMultiScale(img, rects);
}
CPU_SANITY_CHECK(rects);
}
}
///////////////////////////////////////////////////////////////
// LBP cascade
PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier,
Values<pair_string>(make_pair("gpu/haarcascade/group_1_640x480_VGA.pgm", "gpu/lbpcascade/lbpcascade_frontalface.xml")))
{
cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty());
if (PERF_RUN_GPU())
{
cv::gpu::CascadeClassifier_GPU d_cascade;
ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second)));
cv::gpu::GpuMat d_img(img);
cv::gpu::GpuMat d_gpu_rects;
d_cascade.detectMultiScale(d_img, d_gpu_rects);
TEST_CYCLE()
{
d_cascade.detectMultiScale(d_img, d_gpu_rects);
}
GPU_SANITY_CHECK(d_gpu_rects);
}
else
{
cv::CascadeClassifier cascade;
ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath("gpu/lbpcascade/lbpcascade_frontalface.xml")));
std::vector<cv::Rect> rects;
cascade.detectMultiScale(img, rects);
TEST_CYCLE()
{
cascade.detectMultiScale(img, rects);
}
CPU_SANITY_CHECK(rects);
}
}
} // namespace

View File

@@ -1,37 +1,37 @@
#ifdef __GNUC__
# pragma GCC diagnostic ignored "-Wmissing-declarations"
# pragma GCC diagnostic ignored "-Wmissing-prototypes" //OSX
#endif
#ifndef __OPENCV_PERF_PRECOMP_HPP__
#define __OPENCV_PERF_PRECOMP_HPP__
#include <cstdio>
#include <iostream>
#include "cvconfig.h"
#ifdef HAVE_CUDA
#include <cuda_runtime.h>
#endif
#include "opencv2/ts/ts.hpp"
#include "opencv2/ts/ts_perf.hpp"
#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/gpu/gpu.hpp"
#include "opencv2/calib3d/calib3d.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/video/video.hpp"
#include "opencv2/nonfree/nonfree.hpp"
#include "opencv2/legacy/legacy.hpp"
#include "opencv2/photo/photo.hpp"
#include "utility.hpp"
#ifdef GTEST_CREATE_SHARED_LIBRARY
#error no modules except ts should have GTEST_CREATE_SHARED_LIBRARY defined
#endif
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,191 +1,191 @@
#include "perf_precomp.hpp"
using namespace std;
using namespace cv;
using namespace cv::gpu;
void fillRandom(Mat& m, double a, double b)
{
RNG rng(123456789);
rng.fill(m, RNG::UNIFORM, Scalar::all(a), Scalar::all(b));
}
Mat readImage(const string& fileName, int flags)
{
return imread(perf::TestBase::getDataPath(fileName), flags);
}
void PrintTo(const CvtColorInfo& info, ostream* os)
{
static const char* str[] =
{
"BGR2BGRA",
"BGRA2BGR",
"BGR2RGBA",
"RGBA2BGR",
"BGR2RGB",
"BGRA2RGBA",
"BGR2GRAY",
"RGB2GRAY",
"GRAY2BGR",
"GRAY2BGRA",
"BGRA2GRAY",
"RGBA2GRAY",
"BGR2BGR565",
"RGB2BGR565",
"BGR5652BGR",
"BGR5652RGB",
"BGRA2BGR565",
"RGBA2BGR565",
"BGR5652BGRA",
"BGR5652RGBA",
"GRAY2BGR565",
"BGR5652GRAY",
"BGR2BGR555",
"RGB2BGR555",
"BGR5552BGR",
"BGR5552RGB",
"BGRA2BGR555",
"RGBA2BGR555",
"BGR5552BGRA",
"BGR5552RGBA",
"GRAY2BGR555",
"BGR5552GRAY",
"BGR2XYZ",
"RGB2XYZ",
"XYZ2BGR",
"XYZ2RGB",
"BGR2YCrCb",
"RGB2YCrCb",
"YCrCb2BGR",
"YCrCb2RGB",
"BGR2HSV",
"RGB2HSV",
"",
"",
"BGR2Lab",
"RGB2Lab",
"BayerBG2BGR",
"BayerGB2BGR",
"BayerRG2BGR",
"BayerGR2BGR",
"BGR2Luv",
"RGB2Luv",
"BGR2HLS",
"RGB2HLS",
"HSV2BGR",
"HSV2RGB",
"Lab2BGR",
"Lab2RGB",
"Luv2BGR",
"Luv2RGB",
"HLS2BGR",
"HLS2RGB",
"BayerBG2BGR_VNG",
"BayerGB2BGR_VNG",
"BayerRG2BGR_VNG",
"BayerGR2BGR_VNG",
"BGR2HSV_FULL",
"RGB2HSV_FULL",
"BGR2HLS_FULL",
"RGB2HLS_FULL",
"HSV2BGR_FULL",
"HSV2RGB_FULL",
"HLS2BGR_FULL",
"HLS2RGB_FULL",
"LBGR2Lab",
"LRGB2Lab",
"LBGR2Luv",
"LRGB2Luv",
"Lab2LBGR",
"Lab2LRGB",
"Luv2LBGR",
"Luv2LRGB",
"BGR2YUV",
"RGB2YUV",
"YUV2BGR",
"YUV2RGB",
"BayerBG2GRAY",
"BayerGB2GRAY",
"BayerRG2GRAY",
"BayerGR2GRAY",
//YUV 4:2:0 formats family
"YUV2RGB_NV12",
"YUV2BGR_NV12",
"YUV2RGB_NV21",
"YUV2BGR_NV21",
"YUV2RGBA_NV12",
"YUV2BGRA_NV12",
"YUV2RGBA_NV21",
"YUV2BGRA_NV21",
"YUV2RGB_YV12",
"YUV2BGR_YV12",
"YUV2RGB_IYUV",
"YUV2BGR_IYUV",
"YUV2RGBA_YV12",
"YUV2BGRA_YV12",
"YUV2RGBA_IYUV",
"YUV2BGRA_IYUV",
"YUV2GRAY_420",
//YUV 4:2:2 formats family
"YUV2RGB_UYVY",
"YUV2BGR_UYVY",
"YUV2RGB_VYUY",
"YUV2BGR_VYUY",
"YUV2RGBA_UYVY",
"YUV2BGRA_UYVY",
"YUV2RGBA_VYUY",
"YUV2BGRA_VYUY",
"YUV2RGB_YUY2",
"YUV2BGR_YUY2",
"YUV2RGB_YVYU",
"YUV2BGR_YVYU",
"YUV2RGBA_YUY2",
"YUV2BGRA_YUY2",
"YUV2RGBA_YVYU",
"YUV2BGRA_YVYU",
"YUV2GRAY_UYVY",
"YUV2GRAY_YUY2",
// alpha premultiplication
"RGBA2mRGBA",
"mRGBA2RGBA",
"COLORCVT_MAX"
};
*os << str[info.code];
#include "perf_precomp.hpp"
using namespace std;
using namespace cv;
using namespace cv::gpu;
void fillRandom(Mat& m, double a, double b)
{
RNG rng(123456789);
rng.fill(m, RNG::UNIFORM, Scalar::all(a), Scalar::all(b));
}
Mat readImage(const string& fileName, int flags)
{
return imread(perf::TestBase::getDataPath(fileName), flags);
}
void PrintTo(const CvtColorInfo& info, ostream* os)
{
static const char* str[] =
{
"BGR2BGRA",
"BGRA2BGR",
"BGR2RGBA",
"RGBA2BGR",
"BGR2RGB",
"BGRA2RGBA",
"BGR2GRAY",
"RGB2GRAY",
"GRAY2BGR",
"GRAY2BGRA",
"BGRA2GRAY",
"RGBA2GRAY",
"BGR2BGR565",
"RGB2BGR565",
"BGR5652BGR",
"BGR5652RGB",
"BGRA2BGR565",
"RGBA2BGR565",
"BGR5652BGRA",
"BGR5652RGBA",
"GRAY2BGR565",
"BGR5652GRAY",
"BGR2BGR555",
"RGB2BGR555",
"BGR5552BGR",
"BGR5552RGB",
"BGRA2BGR555",
"RGBA2BGR555",
"BGR5552BGRA",
"BGR5552RGBA",
"GRAY2BGR555",
"BGR5552GRAY",
"BGR2XYZ",
"RGB2XYZ",
"XYZ2BGR",
"XYZ2RGB",
"BGR2YCrCb",
"RGB2YCrCb",
"YCrCb2BGR",
"YCrCb2RGB",
"BGR2HSV",
"RGB2HSV",
"",
"",
"BGR2Lab",
"RGB2Lab",
"BayerBG2BGR",
"BayerGB2BGR",
"BayerRG2BGR",
"BayerGR2BGR",
"BGR2Luv",
"RGB2Luv",
"BGR2HLS",
"RGB2HLS",
"HSV2BGR",
"HSV2RGB",
"Lab2BGR",
"Lab2RGB",
"Luv2BGR",
"Luv2RGB",
"HLS2BGR",
"HLS2RGB",
"BayerBG2BGR_VNG",
"BayerGB2BGR_VNG",
"BayerRG2BGR_VNG",
"BayerGR2BGR_VNG",
"BGR2HSV_FULL",
"RGB2HSV_FULL",
"BGR2HLS_FULL",
"RGB2HLS_FULL",
"HSV2BGR_FULL",
"HSV2RGB_FULL",
"HLS2BGR_FULL",
"HLS2RGB_FULL",
"LBGR2Lab",
"LRGB2Lab",
"LBGR2Luv",
"LRGB2Luv",
"Lab2LBGR",
"Lab2LRGB",
"Luv2LBGR",
"Luv2LRGB",
"BGR2YUV",
"RGB2YUV",
"YUV2BGR",
"YUV2RGB",
"BayerBG2GRAY",
"BayerGB2GRAY",
"BayerRG2GRAY",
"BayerGR2GRAY",
//YUV 4:2:0 formats family
"YUV2RGB_NV12",
"YUV2BGR_NV12",
"YUV2RGB_NV21",
"YUV2BGR_NV21",
"YUV2RGBA_NV12",
"YUV2BGRA_NV12",
"YUV2RGBA_NV21",
"YUV2BGRA_NV21",
"YUV2RGB_YV12",
"YUV2BGR_YV12",
"YUV2RGB_IYUV",
"YUV2BGR_IYUV",
"YUV2RGBA_YV12",
"YUV2BGRA_YV12",
"YUV2RGBA_IYUV",
"YUV2BGRA_IYUV",
"YUV2GRAY_420",
//YUV 4:2:2 formats family
"YUV2RGB_UYVY",
"YUV2BGR_UYVY",
"YUV2RGB_VYUY",
"YUV2BGR_VYUY",
"YUV2RGBA_UYVY",
"YUV2BGRA_UYVY",
"YUV2RGBA_VYUY",
"YUV2BGRA_VYUY",
"YUV2RGB_YUY2",
"YUV2BGR_YUY2",
"YUV2RGB_YVYU",
"YUV2BGR_YVYU",
"YUV2RGBA_YUY2",
"YUV2BGRA_YUY2",
"YUV2RGBA_YVYU",
"YUV2BGRA_YVYU",
"YUV2GRAY_UYVY",
"YUV2GRAY_YUY2",
// alpha premultiplication
"RGBA2mRGBA",
"mRGBA2RGBA",
"COLORCVT_MAX"
};
*os << str[info.code];
}
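PrintTo() above is picked up by Google Test through argument-dependent lookup, so a colour-conversion parameter appears in the test listing under its name rather than as raw bytes. A hedged illustration (the chosen conversion is arbitrary):

    // Illustrative only: gtest calls PrintTo() when formatting the test parameter.
    CvtColorInfo info(4, 4, cv::COLOR_BGRA2RGBA);
    PrintTo(info, &std::cout);   // prints "BGRA2RGBA" (index 5 in the table above)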

View File

@@ -1,84 +1,84 @@
#ifndef __OPENCV_PERF_GPU_UTILITY_HPP__
#define __OPENCV_PERF_GPU_UTILITY_HPP__
#include "opencv2/core/core.hpp"
#include "opencv2/core/gpumat.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/ts/ts_perf.hpp"
void fillRandom(cv::Mat& m, double a = 0.0, double b = 255.0);
cv::Mat readImage(const std::string& fileName, int flags = cv::IMREAD_COLOR);
using perf::MatType;
using perf::MatDepth;
CV_ENUM(BorderMode, cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT, cv::BORDER_REFLECT, cv::BORDER_WRAP)
#define ALL_BORDER_MODES testing::ValuesIn(BorderMode::all())
CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC, cv::INTER_AREA)
#define ALL_INTERPOLATIONS testing::ValuesIn(Interpolation::all())
CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2, cv::NORM_HAMMING)
const int Gray = 1, TwoChannel = 2, BGR = 3, BGRA = 4;
CV_ENUM(MatCn, Gray, TwoChannel, BGR, BGRA)
#define GPU_CHANNELS_1_3_4 testing::Values(Gray, BGR, BGRA)
#define GPU_CHANNELS_1_3 testing::Values(Gray, BGR)
struct CvtColorInfo
{
int scn;
int dcn;
int code;
explicit CvtColorInfo(int scn_=0, int dcn_=0, int code_=0) : scn(scn_), dcn(dcn_), code(code_) {}
};
void PrintTo(const CvtColorInfo& info, std::ostream* os);
#define GET_PARAM(k) std::tr1::get< k >(GetParam())
#define DEF_PARAM_TEST(name, ...) typedef ::perf::TestBaseWithParam< std::tr1::tuple< __VA_ARGS__ > > name
#define DEF_PARAM_TEST_1(name, param_type) typedef ::perf::TestBaseWithParam< param_type > name
DEF_PARAM_TEST_1(Sz, cv::Size);
typedef perf::Size_MatType Sz_Type;
DEF_PARAM_TEST(Sz_Depth, cv::Size, MatDepth);
DEF_PARAM_TEST(Sz_Depth_Cn, cv::Size, MatDepth, MatCn);
#define GPU_TYPICAL_MAT_SIZES testing::Values(perf::sz720p, perf::szSXGA, perf::sz1080p)
#define GPU_SANITY_CHECK(dmat, ...) \
do{ \
cv::Mat d##dmat(dmat); \
SANITY_CHECK(d##dmat, ## __VA_ARGS__); \
} while(0)
#define CPU_SANITY_CHECK(cmat, ...) \
do{ \
SANITY_CHECK(cmat, ## __VA_ARGS__); \
} while(0)
#define GPU_SANITY_CHECK_KEYPOINTS(alg, dmat, ...) \
do{ \
cv::Mat d##dmat(dmat); \
cv::Mat __pt_x = d##dmat.row(cv::gpu::alg##_GPU::X_ROW); \
cv::Mat __pt_y = d##dmat.row(cv::gpu::alg##_GPU::Y_ROW); \
cv::Mat __angle = d##dmat.row(cv::gpu::alg##_GPU::ANGLE_ROW); \
cv::Mat __octave = d##dmat.row(cv::gpu::alg##_GPU::OCTAVE_ROW); \
cv::Mat __size = d##dmat.row(cv::gpu::alg##_GPU::SIZE_ROW); \
::perf::Regression::add(this, std::string(#dmat) + "-pt-x-row", __pt_x, ## __VA_ARGS__); \
::perf::Regression::add(this, std::string(#dmat) + "-pt-y-row", __pt_y, ## __VA_ARGS__); \
::perf::Regression::add(this, std::string(#dmat) + "-angle-row", __angle, ## __VA_ARGS__); \
::perf::Regression::add(this, std::string(#dmat) + "octave-row", __octave, ## __VA_ARGS__); \
::perf::Regression::add(this, std::string(#dmat) + "-pt-size-row", __size, ## __VA_ARGS__); \
} while(0)
#define GPU_SANITY_CHECK_RESPONSE(alg, dmat, ...) \
do{ \
cv::Mat d##dmat(dmat); \
cv::Mat __response = d##dmat.row(cv::gpu::alg##_GPU::RESPONSE_ROW); \
::perf::Regression::add(this, std::string(#dmat) + "-response-row", __response, ## __VA_ARGS__); \
} while(0)
#define FAIL_NO_CPU() FAIL() << "No such CPU implementation analogy"
#endif // __OPENCV_PERF_GPU_UTILITY_HPP__
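The fixtures and size lists above are what the per-module perf sources build on; a minimal hedged sketch of a test defined with them (the test name and the trivial copy used as the timed operation are illustrative, not part of this commit):

    PERF_TEST_P(Sz_Depth_Cn, Sketch_NoOpCopy,
        testing::Combine(GPU_TYPICAL_MAT_SIZES, testing::Values(MatDepth(CV_8U)), GPU_CHANNELS_1_3_4))
    {
        const cv::Size size = GET_PARAM(0);
        const int depth = GET_PARAM(1);
        const int channels = GET_PARAM(2);
        cv::Mat src(size, CV_MAKETYPE(depth, channels));
        fillRandom(src);                       // helper declared above
        cv::Mat dst;
        TEST_CYCLE()
        {
            src.copyTo(dst);                   // stand-in for the operation under test
        }
        CPU_SANITY_CHECK(dst);
    }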

File diff suppressed because it is too large Load Diff

View File

@@ -1,158 +1,158 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
using namespace std;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int) { throw_nogpu(); }
cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int, float, float, float) { throw_nogpu(); }
void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace device
{
namespace disp_bilateral_filter
{
void disp_load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc);
template<typename T>
void disp_bilateral_filter(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
}
}}}
using namespace ::cv::gpu::device::disp_bilateral_filter;
namespace
{
const float DEFAULT_EDGE_THRESHOLD = 0.1f;
const float DEFAULT_MAX_DISC_THRESHOLD = 0.2f;
const float DEFAULT_SIGMA_RANGE = 10.0f;
inline void calc_color_weighted_table(GpuMat& table_color, float sigma_range, int len)
{
Mat cpu_table_color(1, len, CV_32F);
float* line = cpu_table_color.ptr<float>();
for(int i = 0; i < len; i++)
line[i] = static_cast<float>(std::exp(-double(i * i) / (2 * sigma_range * sigma_range)));
table_color.upload(cpu_table_color);
}
inline void calc_space_weighted_filter(GpuMat& table_space, int win_size, float dist_space)
{
int half = (win_size >> 1);
Mat cpu_table_space(half + 1, half + 1, CV_32F);
for (int y = 0; y <= half; ++y)
{
float* row = cpu_table_space.ptr<float>(y);
for (int x = 0; x <= half; ++x)
row[x] = exp(-sqrt(float(y * y) + float(x * x)) / dist_space);
}
table_space.upload(cpu_table_space);
}
template <typename T>
void disp_bilateral_filter_operator(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
GpuMat& table_color, GpuMat& table_space,
const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
{
short edge_disc = max<short>(short(1), short(ndisp * edge_threshold + 0.5));
short max_disc = short(ndisp * max_disc_threshold + 0.5);
disp_load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);
if (&dst != &disp)
{
if (stream)
stream.enqueueCopy(disp, dst);
else
disp.copyTo(dst);
}
disp_bilateral_filter<T>(dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
}
typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
GpuMat& table_color, GpuMat& table_space,
const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream);
const bilateral_filter_operator_t operators[] =
{disp_bilateral_filter_operator<unsigned char>, 0, 0, disp_bilateral_filter_operator<short>, 0, 0, 0, 0};
}
cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radius_, int iters_)
: ndisp(ndisp_), radius(radius_), iters(iters_), edge_threshold(DEFAULT_EDGE_THRESHOLD), max_disc_threshold(DEFAULT_MAX_DISC_THRESHOLD),
sigma_range(DEFAULT_SIGMA_RANGE)
{
calc_color_weighted_table(table_color, sigma_range, 255);
calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f);
}
cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radius_, int iters_, float edge_threshold_,
float max_disc_threshold_, float sigma_range_)
: ndisp(ndisp_), radius(radius_), iters(iters_), edge_threshold(edge_threshold_), max_disc_threshold(max_disc_threshold_),
sigma_range(sigma_range_)
{
calc_color_weighted_table(table_color, sigma_range, 255);
calc_space_weighted_filter(table_space, radius * 2 + 1, radius + 1.0f);
}
void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
{
CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters);
CV_Assert(disp.rows == img.rows && disp.cols == img.cols && (disp.type() == CV_8U || disp.type() == CV_16S) && (img.type() == CV_8UC1 || img.type() == CV_8UC3));
operators[disp.type()](ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, stream);
}
#endif /* !defined (HAVE_CUDA) */
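A hedged usage sketch of the filter defined above (sizes are illustrative; the disparity would normally come from one of the gpu stereo matchers, and the Stream argument is assumed to default to the null stream):

    cv::Mat disp_host(480, 640, CV_8U);        // e.g. raw disparity from a stereo matcher
    cv::Mat left_host(480, 640, CV_8UC3);      // colour guide image (CV_8UC1 is also accepted)
    // ... fill disp_host / left_host ...
    cv::gpu::GpuMat d_disp(disp_host), d_left(left_host), d_refined;
    cv::gpu::DisparityBilateralFilter dbf(64 /*ndisp*/, 3 /*radius*/, 1 /*iters*/);
    dbf(d_disp, d_left, d_refined);            // edge-preserving refinement of the disparity
    cv::Mat refined(d_refined);                // download the result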

View File

@@ -1,100 +1,100 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace std;
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
#else
namespace cv { namespace gpu { namespace device
{
namespace blend
{
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
}
}}}
using namespace ::cv::gpu::device::blend;
void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
GpuMat& result, Stream& stream)
{
CV_Assert(img1.size() == img2.size());
CV_Assert(img1.type() == img2.type());
CV_Assert(weights1.size() == img1.size());
CV_Assert(weights2.size() == img2.size());
CV_Assert(weights1.type() == CV_32F);
CV_Assert(weights2.type() == CV_32F);
const Size size = img1.size();
const int depth = img1.depth();
const int cn = img1.channels();
result.create(size, CV_MAKE_TYPE(depth, cn));
switch (depth)
{
case CV_8U:
if (cn != 4)
blendLinearCaller<uchar>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
else
blendLinearCaller8UC4(size.height, size.width, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
break;
case CV_32F:
blendLinearCaller<float>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
break;
default:
CV_Error(CV_StsUnsupportedFormat, "bad image depth in linear blending function");
}
}
#endif
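A hedged usage sketch of blendLinear (constant weight maps keep the example short; per-pixel CV_32F maps are the general case, and the Stream argument is assumed to default to the null stream):

    cv::Mat img1_host(480, 640, CV_8UC3, cv::Scalar::all(255));
    cv::Mat img2_host(480, 640, CV_8UC3, cv::Scalar::all(0));
    cv::Mat w1_host(480, 640, CV_32F, cv::Scalar::all(0.3));   // weight map for img1
    cv::Mat w2_host(480, 640, CV_32F, cv::Scalar::all(0.7));   // weight map for img2
    cv::gpu::GpuMat d_img1(img1_host), d_img2(img2_host), d_w1(w1_host), d_w2(w2_host), d_result;
    cv::gpu::blendLinear(d_img1, d_img2, d_w1, d_w2, d_result); // result = img1*w1 + img2*w2 per pixel
    cv::Mat blended(d_result);                                  // download the blended image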

File diff suppressed because it is too large Load Diff

View File

@@ -1,295 +1,295 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
using namespace std;
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::projectPoints(const GpuMat&, const Mat&, const Mat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat&, Mat&, bool, int, float, int, vector<int>*) { throw_nogpu(); }
#else
namespace cv { namespace gpu { namespace device
{
namespace transform_points
{
void call(const PtrStepSz<float3> src, const float* rot, const float* transl, PtrStepSz<float3> dst, cudaStream_t stream);
}
namespace project_points
{
void call(const PtrStepSz<float3> src, const float* rot, const float* transl, const float* proj, PtrStepSz<float2> dst, cudaStream_t stream);
}
namespace solve_pnp_ransac
{
int maxNumIters();
void computeHypothesisScores(
const int num_hypotheses, const int num_points, const float* rot_matrices,
const float3* transl_vectors, const float3* object, const float2* image,
const float dist_threshold, int* hypothesis_scores);
}
}}}
using namespace ::cv::gpu::device;
namespace
{
void transformPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, cudaStream_t stream)
{
CV_Assert(src.rows == 1 && src.cols > 0 && src.type() == CV_32FC3);
CV_Assert(rvec.size() == Size(3, 1) && rvec.type() == CV_32F);
CV_Assert(tvec.size() == Size(3, 1) && tvec.type() == CV_32F);
// Convert rotation vector into matrix
Mat rot;
Rodrigues(rvec, rot);
dst.create(src.size(), src.type());
transform_points::call(src, rot.ptr<float>(), tvec.ptr<float>(), dst, stream);
}
}
void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, Stream& stream)
{
transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream));
}
namespace
{
void projectPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, cudaStream_t stream)
{
CV_Assert(src.rows == 1 && src.cols > 0 && src.type() == CV_32FC3);
CV_Assert(rvec.size() == Size(3, 1) && rvec.type() == CV_32F);
CV_Assert(tvec.size() == Size(3, 1) && tvec.type() == CV_32F);
CV_Assert(camera_mat.size() == Size(3, 3) && camera_mat.type() == CV_32F);
CV_Assert(dist_coef.empty()); // Undistortion isn't supported
// Convert rotation vector into matrix
Mat rot;
Rodrigues(rvec, rot);
dst.create(src.size(), CV_32FC2);
project_points::call(src, rot.ptr<float>(), tvec.ptr<float>(), camera_mat.ptr<float>(), dst, stream);
}
}
void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, Stream& stream)
{
projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream));
}
namespace
{
// Selects subset_size random different points from [0, num_points - 1] range
void selectRandom(int subset_size, int num_points, vector<int>& subset)
{
subset.resize(subset_size);
for (int i = 0; i < subset_size; ++i)
{
bool was;
do
{
subset[i] = rand() % num_points;
was = false;
for (int j = 0; j < i; ++j)
if (subset[j] == subset[i])
{
was = true;
break;
}
} while (was);
}
}
// Computes a rotation/translation pair for small subsets of the input data
class TransformHypothesesGenerator
{
public:
TransformHypothesesGenerator(const Mat& object_, const Mat& image_, const Mat& dist_coef_,
const Mat& camera_mat_, int num_points_, int subset_size_,
Mat rot_matrices_, Mat transl_vectors_)
: object(&object_), image(&image_), dist_coef(&dist_coef_), camera_mat(&camera_mat_),
num_points(num_points_), subset_size(subset_size_), rot_matrices(rot_matrices_),
transl_vectors(transl_vectors_) {}
void operator()(const BlockedRange& range) const
{
// Input data for generation of the current hypothesis
vector<int> subset_indices(subset_size);
Mat_<Point3f> object_subset(1, subset_size);
Mat_<Point2f> image_subset(1, subset_size);
// Current hypothesis data
Mat rot_vec(1, 3, CV_64F);
Mat rot_mat(3, 3, CV_64F);
Mat transl_vec(1, 3, CV_64F);
for (int iter = range.begin(); iter < range.end(); ++iter)
{
selectRandom(subset_size, num_points, subset_indices);
for (int i = 0; i < subset_size; ++i)
{
object_subset(0, i) = object->at<Point3f>(subset_indices[i]);
image_subset(0, i) = image->at<Point2f>(subset_indices[i]);
}
solvePnP(object_subset, image_subset, *camera_mat, *dist_coef, rot_vec, transl_vec);
// Remember translation vector
Mat transl_vec_ = transl_vectors.colRange(iter * 3, (iter + 1) * 3);
transl_vec = transl_vec.reshape(0, 1);
transl_vec.convertTo(transl_vec_, CV_32F);
// Remember rotation matrix
Rodrigues(rot_vec, rot_mat);
Mat rot_mat_ = rot_matrices.colRange(iter * 9, (iter + 1) * 9).reshape(0, 3);
rot_mat.convertTo(rot_mat_, CV_32F);
}
}
const Mat* object;
const Mat* image;
const Mat* dist_coef;
const Mat* camera_mat;
int num_points;
int subset_size;
// Hypotheses storage (global)
Mat rot_matrices;
Mat transl_vectors;
};
}
void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& camera_mat,
const Mat& dist_coef, Mat& rvec, Mat& tvec, bool use_extrinsic_guess,
int num_iters, float max_dist, int min_inlier_count,
vector<int>* inliers)
{
(void)min_inlier_count;
CV_Assert(object.rows == 1 && object.cols > 0 && object.type() == CV_32FC3);
CV_Assert(image.rows == 1 && image.cols > 0 && image.type() == CV_32FC2);
CV_Assert(object.cols == image.cols);
CV_Assert(camera_mat.size() == Size(3, 3) && camera_mat.type() == CV_32F);
CV_Assert(!use_extrinsic_guess); // We don't support initial guess for now
CV_Assert(num_iters <= solve_pnp_ransac::maxNumIters());
const int subset_size = 4;
const int num_points = object.cols;
CV_Assert(num_points >= subset_size);
// Unapply distortion and intrinsic camera transformations
Mat eye_camera_mat = Mat::eye(3, 3, CV_32F);
Mat empty_dist_coef;
Mat image_normalized;
undistortPoints(image, image_normalized, camera_mat, dist_coef, Mat(), eye_camera_mat);
// Hypotheses storage (global)
Mat rot_matrices(1, num_iters * 9, CV_32F);
Mat transl_vectors(1, num_iters * 3, CV_32F);
// Generate set of hypotheses using small subsets of the input data
TransformHypothesesGenerator body(object, image_normalized, empty_dist_coef, eye_camera_mat,
num_points, subset_size, rot_matrices, transl_vectors);
parallel_for(BlockedRange(0, num_iters), body);
// Compute scores (i.e. number of inliers) for each hypothesis
GpuMat d_object(object);
GpuMat d_image_normalized(image_normalized);
GpuMat d_hypothesis_scores(1, num_iters, CV_32S);
solve_pnp_ransac::computeHypothesisScores(
num_iters, num_points, rot_matrices.ptr<float>(), transl_vectors.ptr<float3>(),
d_object.ptr<float3>(), d_image_normalized.ptr<float2>(), max_dist * max_dist,
d_hypothesis_scores.ptr<int>());
// Find the best hypothesis index
Point best_idx;
double best_score;
minMaxLoc(d_hypothesis_scores, NULL, &best_score, NULL, &best_idx);
int num_inliers = static_cast<int>(best_score);
// Extract the best hypothesis data
Mat rot_mat = rot_matrices.colRange(best_idx.x * 9, (best_idx.x + 1) * 9).reshape(0, 3);
Rodrigues(rot_mat, rvec);
rvec = rvec.reshape(0, 1);
tvec = transl_vectors.colRange(best_idx.x * 3, (best_idx.x + 1) * 3).clone();
tvec = tvec.reshape(0, 1);
// Build vector of inlier indices
if (inliers != NULL)
{
inliers->clear();
inliers->reserve(num_inliers);
Point3f p, p_transf;
Point2f p_proj;
const float* rot = rot_mat.ptr<float>();
const float* transl = tvec.ptr<float>();
for (int i = 0; i < num_points; ++i)
{
p = object.at<Point3f>(0, i);
p_transf.x = rot[0] * p.x + rot[1] * p.y + rot[2] * p.z + transl[0];
p_transf.y = rot[3] * p.x + rot[4] * p.y + rot[5] * p.z + transl[1];
p_transf.z = rot[6] * p.x + rot[7] * p.y + rot[8] * p.z + transl[2];
p_proj.x = p_transf.x / p_transf.z;
p_proj.y = p_transf.y / p_transf.z;
if (norm(p_proj - image_normalized.at<Point2f>(0, i)) < max_dist)
inliers->push_back(i);
}
}
}
#endif
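A hedged usage sketch of the GPU solvePnPRansac above (point data, intrinsics and iteration count are illustrative; note the single-row layouts and the CV_32F camera matrix required by the assertions, and that an initial extrinsic guess is not supported):

    const int N = 100;
    cv::Mat object(1, N, CV_32FC3);                     // 3D model points
    cv::Mat image(1, N, CV_32FC2);                      // matching 2D projections
    // ... fill object / image ...
    cv::Mat camera_mat = (cv::Mat_<float>(3, 3) << 800.f, 0.f, 320.f,
                                                   0.f, 800.f, 240.f,
                                                   0.f, 0.f, 1.f);
    cv::Mat dist_coef;                                  // empty: no distortion model
    cv::Mat rvec, tvec;
    std::vector<int> inliers;
    cv::gpu::solvePnPRansac(object, image, camera_mat, dist_coef, rvec, tvec,
                            false /*use_extrinsic_guess*/, 100 /*num_iters*/,
                            8.0f /*max_dist*/, 100 /*min_inlier_count (unused)*/, &inliers);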

File diff suppressed because it is too large Load Diff

View File

@@ -1,139 +1,139 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "cu_safe_call.h"
#ifdef HAVE_CUDA
namespace
{
#define error_entry(entry) { entry, #entry }
struct ErrorEntry
{
int code;
std::string str;
};
class ErrorEntryComparer
{
public:
inline ErrorEntryComparer(int code) : code_(code) {}
inline bool operator()(const ErrorEntry& e) const { return e.code == code_; }
private:
int code_;
};
std::string getErrorString(int code, const ErrorEntry* errors, size_t n)
{
size_t idx = std::find_if(errors, errors + n, ErrorEntryComparer(code)) - errors;
const std::string& msg = (idx != n) ? errors[idx].str : std::string("Unknown error code");
std::ostringstream ostr;
ostr << msg << " [Code = " << code << "]";
return ostr.str();
}
const ErrorEntry cu_errors [] =
{
error_entry( CUDA_SUCCESS ),
error_entry( CUDA_ERROR_INVALID_VALUE ),
error_entry( CUDA_ERROR_OUT_OF_MEMORY ),
error_entry( CUDA_ERROR_NOT_INITIALIZED ),
error_entry( CUDA_ERROR_DEINITIALIZED ),
error_entry( CUDA_ERROR_PROFILER_DISABLED ),
error_entry( CUDA_ERROR_PROFILER_NOT_INITIALIZED ),
error_entry( CUDA_ERROR_PROFILER_ALREADY_STARTED ),
error_entry( CUDA_ERROR_PROFILER_ALREADY_STOPPED ),
error_entry( CUDA_ERROR_NO_DEVICE ),
error_entry( CUDA_ERROR_INVALID_DEVICE ),
error_entry( CUDA_ERROR_INVALID_IMAGE ),
error_entry( CUDA_ERROR_INVALID_CONTEXT ),
error_entry( CUDA_ERROR_CONTEXT_ALREADY_CURRENT ),
error_entry( CUDA_ERROR_MAP_FAILED ),
error_entry( CUDA_ERROR_UNMAP_FAILED ),
error_entry( CUDA_ERROR_ARRAY_IS_MAPPED ),
error_entry( CUDA_ERROR_ALREADY_MAPPED ),
error_entry( CUDA_ERROR_NO_BINARY_FOR_GPU ),
error_entry( CUDA_ERROR_ALREADY_ACQUIRED ),
error_entry( CUDA_ERROR_NOT_MAPPED ),
error_entry( CUDA_ERROR_NOT_MAPPED_AS_ARRAY ),
error_entry( CUDA_ERROR_NOT_MAPPED_AS_POINTER ),
error_entry( CUDA_ERROR_ECC_UNCORRECTABLE ),
error_entry( CUDA_ERROR_UNSUPPORTED_LIMIT ),
error_entry( CUDA_ERROR_CONTEXT_ALREADY_IN_USE ),
error_entry( CUDA_ERROR_INVALID_SOURCE ),
error_entry( CUDA_ERROR_FILE_NOT_FOUND ),
error_entry( CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND ),
error_entry( CUDA_ERROR_SHARED_OBJECT_INIT_FAILED ),
error_entry( CUDA_ERROR_OPERATING_SYSTEM ),
error_entry( CUDA_ERROR_INVALID_HANDLE ),
error_entry( CUDA_ERROR_NOT_FOUND ),
error_entry( CUDA_ERROR_NOT_READY ),
error_entry( CUDA_ERROR_LAUNCH_FAILED ),
error_entry( CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES ),
error_entry( CUDA_ERROR_LAUNCH_TIMEOUT ),
error_entry( CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING ),
error_entry( CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED ),
error_entry( CUDA_ERROR_PEER_ACCESS_NOT_ENABLED ),
error_entry( CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE ),
error_entry( CUDA_ERROR_CONTEXT_IS_DESTROYED ),
error_entry( CUDA_ERROR_ASSERT ),
error_entry( CUDA_ERROR_TOO_MANY_PEERS ),
error_entry( CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED ),
error_entry( CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED ),
error_entry( CUDA_ERROR_UNKNOWN )
};
const size_t cu_errors_num = sizeof(cu_errors) / sizeof(cu_errors[0]);
}
std::string cv::gpu::detail::cuGetErrString(CUresult res)
{
return getErrorString(res, cu_errors, cu_errors_num);
}
#endif // HAVE_CUDA
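For reference, the lookup above is a plain linear std::find_if over a static table of { code, name } pairs, falling back to "Unknown error code" and always appending the numeric code. A minimal standalone sketch of the same pattern follows, with an illustrative two-entry table in place of the full cu_errors array (the numeric values match the CUDA driver API enum, where CUDA_SUCCESS is 0 and CUDA_ERROR_OUT_OF_MEMORY is 2):
#include <algorithm>
#include <iostream>
#include <sstream>
#include <string>
struct ErrorEntry { int code; std::string str; };
struct ErrorEntryComparer
{
    explicit ErrorEntryComparer(int code) : code_(code) {}
    bool operator()(const ErrorEntry& e) const { return e.code == code_; }
    int code_;
};
std::string getErrorString(int code, const ErrorEntry* errors, size_t n)
{
    const size_t idx = std::find_if(errors, errors + n, ErrorEntryComparer(code)) - errors;
    std::ostringstream ostr;
    ostr << ((idx != n) ? errors[idx].str : std::string("Unknown error code")) << " [Code = " << code << "]";
    return ostr.str();
}
int main()
{
    // Illustrative two-entry table, not the real cu_errors array.
    const ErrorEntry table[] = { { 0, "CUDA_SUCCESS" }, { 2, "CUDA_ERROR_OUT_OF_MEMORY" } };
    std::cout << getErrorString(2, table, 2) << std::endl;   // CUDA_ERROR_OUT_OF_MEMORY [Code = 2]
    std::cout << getErrorString(777, table, 2) << std::endl; // Unknown error code [Code = 777]
    return 0;
}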

View File

@@ -1,67 +1,67 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __CU_SAFE_CALL_H__
#define __CU_SAFE_CALL_H__
#include "precomp.hpp"
#ifdef HAVE_CUDA
namespace cv { namespace gpu {
namespace detail
{
std::string cuGetErrString(CUresult res);
inline void cuSafeCall_impl(CUresult res, const char* file, int line)
{
if (res != CUDA_SUCCESS)
cv::error( cv::Exception(CV_GpuApiCallError, cuGetErrString(res), "unknown function", file, line) );
}
}
}}
#define cuSafeCall( op ) cv::gpu::detail::cuSafeCall_impl( (op), __FILE__, __LINE__ )
#endif // HAVE_CUDA
#endif // __CU_SAFE_CALL_H__
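As a usage sketch (not part of the header): the macro wraps raw CUDA driver API calls so that any failing CUresult becomes a cv::Exception carrying the decoded error string plus the call site. The example below assumes a translation unit that includes this header, is built with HAVE_CUDA, and gets the driver API declarations through precomp.hpp, as the header itself does:
#include "cu_safe_call.h"
#ifdef HAVE_CUDA
void exampleDriverApiInit()
{
    // Each cuSafeCall expands to cv::gpu::detail::cuSafeCall_impl(<call>, __FILE__, __LINE__),
    // which throws through cv::error whenever the result is not CUDA_SUCCESS.
    cuSafeCall( cuInit(0) );
    CUdevice dev = 0;
    cuSafeCall( cuDeviceGet(&dev, 0) );   // throws if no CUDA device with ordinal 0 exists
}
#endif // HAVE_CUDA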

View File

@@ -1,212 +1,212 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
NV12ToARGB color space conversion CUDA kernel
This sample uses CUDA to convert an NV12 (YUV 4:2:0 planar)
source frame to output in ARGB format
*/
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
namespace cv { namespace gpu { namespace device {
namespace video_decoding
{
__constant__ uint constAlpha = ((uint)0xff << 24);
__constant__ float constHueColorSpaceMat[9];
void loadHueCSC(float hueCSC[9])
{
cudaSafeCall( cudaMemcpyToSymbol(constHueColorSpaceMat, hueCSC, 9 * sizeof(float)) );
}
__device__ void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue)
{
float luma, chromaCb, chromaCr;
// Prepare for hue adjustment
luma = (float)yuvi[0];
chromaCb = (float)((int)yuvi[1] - 512.0f);
chromaCr = (float)((int)yuvi[2] - 512.0f);
// Convert YUV To RGB with hue adjustment
*red = (luma * constHueColorSpaceMat[0]) +
(chromaCb * constHueColorSpaceMat[1]) +
(chromaCr * constHueColorSpaceMat[2]);
*green = (luma * constHueColorSpaceMat[3]) +
(chromaCb * constHueColorSpaceMat[4]) +
(chromaCr * constHueColorSpaceMat[5]);
*blue = (luma * constHueColorSpaceMat[6]) +
(chromaCb * constHueColorSpaceMat[7]) +
(chromaCr * constHueColorSpaceMat[8]);
}
__device__ uint RGBAPACK_10bit(float red, float green, float blue, uint alpha)
{
uint ARGBpixel = 0;
// Clamp final 10 bit results
red = ::fmin(::fmax(red, 0.0f), 1023.f);
green = ::fmin(::fmax(green, 0.0f), 1023.f);
blue = ::fmin(::fmax(blue, 0.0f), 1023.f);
// Convert to 8 bit unsigned integers per color component
ARGBpixel = (((uint)blue >> 2) |
(((uint)green >> 2) << 8) |
(((uint)red >> 2) << 16) |
(uint)alpha);
return ARGBpixel;
}
// CUDA kernel for producing the final ARGB output from NV12
#define COLOR_COMPONENT_BIT_SIZE 10
#define COLOR_COMPONENT_MASK 0x3FF
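// Packing note: each 32-bit word of yuv101010Pel built in the kernel below carries three
// 10-bit fields - Y in bits 0..9, Cb in bits 10..19, Cr in bits 20..29 - where every
// 8-bit sample has been promoted to 10 bits via << 2, so mid-grey chroma (128) becomes
// 512, the offset subtracted in YUV2RGB above, and RGBAPACK_10bit shifts the clamped
// 10-bit results back down to 8 bits per channel.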
__global__ void NV12ToARGB(uchar* srcImage, size_t nSourcePitch,
uint* dstImage, size_t nDestPitch,
uint width, uint height)
{
// Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
const int x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= width || y >= height)
return;
// Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
// if we move to texture we could read 4 luminance values
uint yuv101010Pel[2];
yuv101010Pel[0] = (srcImage[y * nSourcePitch + x ]) << 2;
yuv101010Pel[1] = (srcImage[y * nSourcePitch + x + 1]) << 2;
const size_t chromaOffset = nSourcePitch * height;
const int y_chroma = y >> 1;
if (y & 1) // odd scanline ?
{
uint chromaCb = srcImage[chromaOffset + y_chroma * nSourcePitch + x ];
uint chromaCr = srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1];
if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
{
chromaCb = (chromaCb + srcImage[chromaOffset + (y_chroma + 1) * nSourcePitch + x ] + 1) >> 1;
chromaCr = (chromaCr + srcImage[chromaOffset + (y_chroma + 1) * nSourcePitch + x + 1] + 1) >> 1;
}
yuv101010Pel[0] |= (chromaCb << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
yuv101010Pel[1] |= (chromaCb << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
}
else
{
yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x ] << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x ] << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
}
// this step performs the color conversion
uint yuvi[6];
float red[2], green[2], blue[2];
yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK );
yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK);
yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK );
yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK);
yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
// YUV to RGB Transformation conversion
YUV2RGB(&yuvi[0], &red[0], &green[0], &blue[0]);
YUV2RGB(&yuvi[3], &red[1], &green[1], &blue[1]);
// Clamp the results to RGBA
const size_t dstImagePitch = nDestPitch >> 2;
dstImage[y * dstImagePitch + x ] = RGBAPACK_10bit(red[0], green[0], blue[0], constAlpha);
dstImage[y * dstImagePitch + x + 1 ] = RGBAPACK_10bit(red[1], green[1], blue[1], constAlpha);
}
void NV12ToARGB_gpu(const PtrStepb decodedFrame, PtrStepSz<uint> interopFrame, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(interopFrame.cols, 2 * block.x), divUp(interopFrame.rows, block.y));
NV12ToARGB<<<grid, block, 0, stream>>>(decodedFrame.data, decodedFrame.step, interopFrame.data, interopFrame.step,
interopFrame.cols, interopFrame.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif /* CUDA_DISABLER */
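For context, a hedged host-side sketch of how the two entry points above fit together. The namespace and the loadHueCSC and NV12ToARGB_gpu signatures are taken from this file; the wrapper function name and the matrix coefficients are placeholders, since the real coefficients are computed by the decoder's color-space setup rather than hard-coded:
#include "opencv2/gpu/device/common.hpp"
namespace cv { namespace gpu { namespace device { namespace video_decoding
{
    // declarations matching the definitions above
    void loadHueCSC(float hueCSC[9]);
    void NV12ToARGB_gpu(const PtrStepb decodedFrame, PtrStepSz<unsigned int> interopFrame, cudaStream_t stream);
}}}}
void convertDecodedFrameExample(const cv::gpu::PtrStepb& nv12Frame,
                                cv::gpu::PtrStepSz<unsigned int> argbFrame,
                                cudaStream_t stream)
{
    // Placeholder 3x3 row-major YUV -> RGB matrix (BT.601-like, no hue rotation);
    // the coefficients actually uploaded by the video reader may differ.
    float hueCSC[9] = { 1.1644f,  0.0f,     1.5960f,
                        1.1644f, -0.3918f, -0.8130f,
                        1.1644f,  2.0172f,  0.0f };
    cv::gpu::device::video_decoding::loadHueCSC(hueCSC);
    cv::gpu::device::video_decoding::NV12ToARGB_gpu(nv12Frame, argbFrame, stream);
}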

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,472 +1,472 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
namespace cv { namespace gpu { namespace device
{
namespace bf_radius_match
{
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled
template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Dist dist;
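// Tiling note: each loop iteration below stages one BLOCK_SIZE x BLOCK_SIZE tile of
// query descriptors (row-major in s_query) and one tile of train descriptors (stored
// transposed in s_train), then every thread accumulates BLOCK_SIZE partial terms, so
// a single thread ends up holding the full distance between its (queryIdx, trainIdx)
// pair once all tiles have been processed.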
#pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
float distVal = (typename Dist::result_type)dist;
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
{
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
if (ind < maxCount)
{
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
#endif
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
for (int i = 0; i < n; ++i)
{
const PtrStepSz<T> train = trains[i];
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
if (masks != 0 && masks[i].data)
{
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
else
{
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match
template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Dist dist;
for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
float distVal = (typename Dist::result_type)dist;
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
{
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
if (ind < maxCount)
{
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
#endif
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, typename Dist, typename T>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
for (int i = 0; i < n; ++i)
{
const PtrStepSz<T> train = trains[i];
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
if (masks != 0 && masks[i].data)
{
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
else
{
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match dispatcher
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 128)
{
matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}*/
else
{
match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
}
template <typename Dist, typename T>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 128)
{
matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}*/
else
{
match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match caller
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
}
}
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
}
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
}
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
} // namespace bf_radius_match
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
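To make the output layout of these kernels concrete: for every query row q they atomically increment nMatches[q] and append one (train index, distance) pair per accepted neighbour into row q of the result matrices, silently dropping anything beyond trainIdx.cols entries. Below is a hedged host-side sketch of turning downloaded copies of those buffers into cv::DMatch lists; the function name and matrix names are illustrative, not part of this file:
#include <algorithm>
#include <vector>
#include "opencv2/core/core.hpp"
#include "opencv2/features2d/features2d.hpp"
// Illustration only: nMatchesMat (1 x nQuery, 32-bit counts), trainIdxMat and
// distanceMat (nQuery x maxCount each) stand for host downloads of the buffers
// filled by the kernels above.
void collectRadiusMatches(const cv::Mat& nMatchesMat,
                          const cv::Mat& trainIdxMat,
                          const cv::Mat& distanceMat,
                          std::vector< std::vector<cv::DMatch> >& matches)
{
    matches.resize(trainIdxMat.rows);
    for (int q = 0; q < trainIdxMat.rows; ++q)
    {
        // the kernels may bump the counter past maxCount, so clamp before reading
        const int found = std::min(nMatchesMat.at<int>(0, q), trainIdxMat.cols);
        for (int i = 0; i < found; ++i)
            matches[q].push_back(cv::DMatch(q, trainIdxMat.at<int>(q, i), distanceMat.at<float>(q, i)));
    }
}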
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
namespace cv { namespace gpu { namespace device
{
namespace bf_radius_match
{
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled
template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Dist dist;
#pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
float distVal = (typename Dist::result_type)dist;
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
{
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
if (ind < maxCount)
{
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
#endif
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
for (int i = 0; i < n; ++i)
{
const PtrStepSz<T> train = trains[i];
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
if (masks != 0 && masks[i].data)
{
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
else
{
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match
template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Dist dist;
for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
float distVal = (typename Dist::result_type)dist;
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
{
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
if (ind < maxCount)
{
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
#endif
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, typename Dist, typename T>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
for (int i = 0; i < n; ++i)
{
const PtrStepSz<T> train = trains[i];
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
if (masks != 0 && masks[i].data)
{
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
else
{
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match dispatcher
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 128)
{
matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}*/
else
{
match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
}
template <typename Dist, typename T>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 128)
{
matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}*/
else
{
match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match caller
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
}
}
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
}
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
}
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
} // namespace bf_radius_match
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
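For orientation: the kernels above stream BLOCK_SIZE-wide tiles of the query and train descriptors through shared memory and keep every train row whose distance to a query row falls below maxDistance. A minimal CPU-side sketch of that radius test, assuming plain float descriptors and an L2 metric (hypothetical helper, not part of the module above):

#include <cmath>
#include <vector>

struct RadiusHit { int trainIdx; float distance; };

// Hypothetical host reference: collect every train descriptor closer than maxDistance.
static std::vector<RadiusHit> radiusMatchL2(const float* query, const float* train,
                                            int trainRows, int cols, float maxDistance)
{
    std::vector<RadiusHit> hits;
    for (int t = 0; t < trainRows; ++t)
    {
        float sum = 0.f;
        for (int c = 0; c < cols; ++c)
        {
            const float diff = query[c] - train[t * cols + c];
            sum += diff * diff;                      // accumulate squared differences
        }
        const float dist = std::sqrt(sum);           // L2 distance
        if (dist < maxDistance)
            hits.push_back(RadiusHit{t, dist});      // mirrors the distVal < maxDistance test and store
    }
    return hits;
}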

View File

@@ -1,201 +1,201 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
using namespace cv::gpu;
typedef unsigned char uchar;
typedef unsigned short ushort;
//////////////////////////////////////////////////////////////////////////////////
/// Bilateral filtering
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
__device__ __forceinline__ float norm_l1(const float& a) { return ::fabs(a); }
__device__ __forceinline__ float norm_l1(const float2& a) { return ::fabs(a.x) + ::fabs(a.y); }
__device__ __forceinline__ float norm_l1(const float3& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z); }
__device__ __forceinline__ float norm_l1(const float4& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z) + ::fabs(a.w); }
__device__ __forceinline__ float sqr(const float& a) { return a * a; }
template<typename T, typename B>
__global__ void bilateral_kernel(const PtrStepSz<T> src, PtrStep<T> dst, const B b, const int ksz, const float sigma_spatial2_inv_half, const float sigma_color2_inv_half)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x >= src.cols || y >= src.rows)
return;
value_type center = saturate_cast<value_type>(src(y, x));
value_type sum1 = VecTraits<value_type>::all(0);
float sum2 = 0;
int r = ksz / 2;
float r2 = (float)(r * r);
int tx = x - r + ksz;
int ty = y - r + ksz;
if (x - ksz/2 >=0 && y - ksz/2 >=0 && tx < src.cols && ty < src.rows)
{
for (int cy = y - r; cy < ty; ++cy)
for (int cx = x - r; cx < tx; ++cx)
{
float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
if (space2 > r2)
continue;
value_type value = saturate_cast<value_type>(src(cy, cx));
float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
sum1 = sum1 + weight * value;
sum2 = sum2 + weight;
}
}
else
{
for (int cy = y - r; cy < ty; ++cy)
for (int cx = x - r; cx < tx; ++cx)
{
float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
if (space2 > r2)
continue;
value_type value = saturate_cast<value_type>(b.at(cy, cx, src.data, src.step));
float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
sum1 = sum1 + weight * value;
sum2 = sum2 + weight;
}
}
dst(y, x) = saturate_cast<T>(sum1 / sum2);
}
template<typename T, template <typename> class B>
void bilateral_caller(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream)
{
dim3 block (32, 8);
dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y));
B<T> b(src.rows, src.cols);
float sigma_spatial2_inv_half = -0.5f/(sigma_spatial * sigma_spatial);
float sigma_color2_inv_half = -0.5f/(sigma_color * sigma_color);
cudaSafeCall( cudaFuncSetCacheConfig (bilateral_kernel<T, B<T> >, cudaFuncCachePreferL1) );
bilateral_kernel<<<grid, block>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, kernel_size, sigma_spatial2_inv_half, sigma_color2_inv_half);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template<typename T>
void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float gauss_spatial_coeff, float gauss_color_coeff, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream);
static caller_t funcs[] =
{
bilateral_caller<T, BrdReflect101>,
bilateral_caller<T, BrdReplicate>,
bilateral_caller<T, BrdConstant>,
bilateral_caller<T, BrdReflect>,
bilateral_caller<T, BrdWrap>,
};
funcs[borderMode](src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream);
}
}
}}}
#define OCV_INSTANTIATE_BILATERAL_FILTER(T) \
template void cv::gpu::device::imgproc::bilateral_filter_gpu<T>(const PtrStepSzb&, PtrStepSzb, int, float, float, int, cudaStream_t);
OCV_INSTANTIATE_BILATERAL_FILTER(uchar)
//OCV_INSTANTIATE_BILATERAL_FILTER(uchar2)
OCV_INSTANTIATE_BILATERAL_FILTER(uchar3)
OCV_INSTANTIATE_BILATERAL_FILTER(uchar4)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar2)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar3)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar4)
OCV_INSTANTIATE_BILATERAL_FILTER(short)
//OCV_INSTANTIATE_BILATERAL_FILTER(short2)
OCV_INSTANTIATE_BILATERAL_FILTER(short3)
OCV_INSTANTIATE_BILATERAL_FILTER(short4)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort)
//OCV_INSTANTIATE_BILATERAL_FILTER(ushort2)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort3)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort4)
//OCV_INSTANTIATE_BILATERAL_FILTER(int)
//OCV_INSTANTIATE_BILATERAL_FILTER(int2)
//OCV_INSTANTIATE_BILATERAL_FILTER(int3)
//OCV_INSTANTIATE_BILATERAL_FILTER(int4)
OCV_INSTANTIATE_BILATERAL_FILTER(float)
//OCV_INSTANTIATE_BILATERAL_FILTER(float2)
OCV_INSTANTIATE_BILATERAL_FILTER(float3)
OCV_INSTANTIATE_BILATERAL_FILTER(float4)
#endif /* CUDA_DISABLER */
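The neighbour weight computed in bilateral_kernel combines a spatial and a colour Gaussian; sigma_spatial2_inv_half and sigma_color2_inv_half are simply -0.5 / sigma^2, precomputed in bilateral_caller. A one-pixel sketch of the same weight, assuming a scalar colour difference (hypothetical helper, for illustration only):

#include <cmath>

// Hypothetical reference for the weight used in bilateral_kernel:
// exp(-d_spatial^2 / (2*sigma_spatial^2) - d_color^2 / (2*sigma_color^2)).
static float bilateralWeight(float dx, float dy, float colorDiff,
                             float sigma_spatial, float sigma_color)
{
    const float space2                  = dx * dx + dy * dy;
    const float sigma_spatial2_inv_half = -0.5f / (sigma_spatial * sigma_spatial);
    const float sigma_color2_inv_half   = -0.5f / (sigma_color  * sigma_color);
    return std::exp(space2 * sigma_spatial2_inv_half
                    + colorDiff * colorDiff * sigma_color2_inv_half);
}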

View File

@@ -1,121 +1,121 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace device
{
namespace blend
{
template <typename T>
__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < rows && x < cols)
{
int x_ = x / cn;
float w1 = weights1.ptr(y)[x_];
float w2 = weights2.ptr(y)[x_];
T p1 = img1.ptr(y)[x];
T p2 = img2.ptr(y)[x];
result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
}
}
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));
blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < rows && x < cols)
{
float w1 = weights1.ptr(y)[x];
float w2 = weights2.ptr(y)[x];
float sum_inv = 1.f / (w1 + w2 + 1e-5f);
w1 *= sum_inv;
w2 *= sum_inv;
uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];
uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];
((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,
p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);
}
}
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
} // namespace blend
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
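Both blend kernels apply the same per-pixel rule, result = (p1*w1 + p2*w2) / (w1 + w2 + 1e-5); the 8UC4 variant normalises the weights once and then mixes the four channels. A scalar sketch of that rule (hypothetical helper, for illustration only):

// Hypothetical scalar reference for blendLinearKernel / blendLinearKernel8UC4.
static inline float blendLinearPixel(float p1, float p2, float w1, float w2)
{
    // The 1e-5f term matches the kernels and guards against division by zero
    // where both weight maps are zero.
    return (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
}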

File diff suppressed because it is too large

View File

@@ -1,384 +1,384 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <internal_shared.hpp>
#include <opencv2/gpu/device/transform.hpp>
#include <opencv2/gpu/device/color.hpp>
#include <cvt_colot_internal.h>
namespace cv { namespace gpu { namespace device
{
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_x = 8 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
void name(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream) \
{ \
traits::functor_type functor = traits::create_functor(); \
typedef typename traits::functor_type::argument_type src_t; \
typedef typename traits::functor_type::result_type dst_t; \
cv::gpu::device::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \
}
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)
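// Reading aid (not part of the original file): for a non-template traits class such as
// bgr_to_bgr555_traits, OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555) expands to roughly
// the following host wrapper, which binds the conversion functor to the generic per-element
// transform:
//
//   void bgr_to_bgr555(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream)
//   {
//       bgr_to_bgr555_traits::functor_type functor = bgr_to_bgr555_traits::create_functor();
//       typedef bgr_to_bgr555_traits::functor_type::argument_type src_t;
//       typedef bgr_to_bgr555_traits::functor_type::result_type dst_t;
//       cv::gpu::device::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream);
//   }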
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <internal_shared.hpp>
#include <opencv2/gpu/device/transform.hpp>
#include <opencv2/gpu/device/color.hpp>
#include <cvt_colot_internal.h>
namespace cv { namespace gpu { namespace device
{
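// The OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS specializations below are launch-tuning hints
// consumed by cv::gpu::device::transform: the smart_block_dim_* values pick the block shape,
// and smart_shift appears to control how many elements each thread handles. The exact
// interpretation lives in the transform implementation; the values here simply override its
// defaults for the listed color-conversion functors.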
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_x = 8 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
void name(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream) \
{ \
traits::functor_type functor = traits::create_functor(); \
typedef typename traits::functor_type::argument_type src_t; \
typedef typename traits::functor_type::result_type dst_t; \
cv::gpu::device::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \
}
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)
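// For reference, a single OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb) invocation below
// expands, per the macros above, into three thin wrappers, one per depth; e.g. for 8-bit:
//
//   void bgr_to_rgb_8u(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream)
//   {
//       bgr_to_rgb_traits<uchar>::functor_type functor = bgr_to_rgb_traits<uchar>::create_functor();
//       typedef typename bgr_to_rgb_traits<uchar>::functor_type::argument_type src_t;
//       typedef typename bgr_to_rgb_traits<uchar>::functor_type::result_type dst_t;
//       cv::gpu::device::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream);
//   }
//
// and likewise bgr_to_rgb_16u (ushort) and bgr_to_rgb_32f (float).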
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */

View File

@@ -1,388 +1,388 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/static_check.hpp"
namespace cv { namespace gpu { namespace device
{
namespace column_filter
{
#define MAX_KERNEL_SIZE 32
__constant__ float c_kernel[MAX_KERNEL_SIZE];
void loadKernel(const float* kernel, int ksize, cudaStream_t stream)
{
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
}
template <int KSIZE, typename T, typename D, typename B>
__global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
const int BLOCK_DIM_X = 16;
const int BLOCK_DIM_Y = 16;
const int PATCH_PER_BLOCK = 4;
const int HALO_SIZE = KSIZE <= 16 ? 1 : 2;
#else
const int BLOCK_DIM_X = 16;
const int BLOCK_DIM_Y = 8;
const int PATCH_PER_BLOCK = 2;
const int HALO_SIZE = 2;
#endif
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
__shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X];
const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
if (x >= src.cols)
return;
const T* src_col = src.ptr() + x;
const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;
if (blockIdx.y > 0)
{
//Upper halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x));
}
else
{
//Upper halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));
}
if (blockIdx.y + 2 < gridDim.y)
{
//Main data
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x));
//Lower halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x));
}
else
{
//Main data
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));
//Lower halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));
}
__syncthreads();
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
{
const int y = yStart + j * BLOCK_DIM_Y;
if (y < src.rows)
{
sum_t sum = VecTraits<sum_t>::all(0);
#pragma unroll
for (int k = 0; k < KSIZE; ++k)
sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
dst(y, x) = saturate_cast<D>(sum);
}
}
}
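// Summary of the staging scheme above: each block stages PATCH_PER_BLOCK * BLOCK_DIM_Y output
// rows plus HALO_SIZE * BLOCK_DIM_Y rows of halo on each side into shared memory. Interior
// blocks load straight from src; blocks touching the top or bottom edge route their loads
// through the border object brd (at_low / at_high), so the convolution loop never reads out
// of bounds.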
template <int KSIZE, typename T, typename D, template<typename> class B>
void linearColumnFilter_caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
{
int BLOCK_DIM_X;
int BLOCK_DIM_Y;
int PATCH_PER_BLOCK;
if (cc >= 20)
{
BLOCK_DIM_X = 16;
BLOCK_DIM_Y = 16;
PATCH_PER_BLOCK = 4;
}
else
{
BLOCK_DIM_X = 16;
BLOCK_DIM_Y = 8;
PATCH_PER_BLOCK = 2;
}
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK));
B<T> brd(src.rows);
linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
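// Note: divUp comes from internal_shared.hpp and is assumed here to be the usual integer
// ceiling division, roughly:
//
//   static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
//
// so the grid covers every column once and every group of BLOCK_DIM_Y * PATCH_PER_BLOCK rows once.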
template <typename T, typename D>
void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
static const caller_t callers[5][33] =
{
{
0,
linearColumnFilter_caller< 1, T, D, BrdColReflect101>,
linearColumnFilter_caller< 2, T, D, BrdColReflect101>,
linearColumnFilter_caller< 3, T, D, BrdColReflect101>,
linearColumnFilter_caller< 4, T, D, BrdColReflect101>,
linearColumnFilter_caller< 5, T, D, BrdColReflect101>,
linearColumnFilter_caller< 6, T, D, BrdColReflect101>,
linearColumnFilter_caller< 7, T, D, BrdColReflect101>,
linearColumnFilter_caller< 8, T, D, BrdColReflect101>,
linearColumnFilter_caller< 9, T, D, BrdColReflect101>,
linearColumnFilter_caller<10, T, D, BrdColReflect101>,
linearColumnFilter_caller<11, T, D, BrdColReflect101>,
linearColumnFilter_caller<12, T, D, BrdColReflect101>,
linearColumnFilter_caller<13, T, D, BrdColReflect101>,
linearColumnFilter_caller<14, T, D, BrdColReflect101>,
linearColumnFilter_caller<15, T, D, BrdColReflect101>,
linearColumnFilter_caller<16, T, D, BrdColReflect101>,
linearColumnFilter_caller<17, T, D, BrdColReflect101>,
linearColumnFilter_caller<18, T, D, BrdColReflect101>,
linearColumnFilter_caller<19, T, D, BrdColReflect101>,
linearColumnFilter_caller<20, T, D, BrdColReflect101>,
linearColumnFilter_caller<21, T, D, BrdColReflect101>,
linearColumnFilter_caller<22, T, D, BrdColReflect101>,
linearColumnFilter_caller<23, T, D, BrdColReflect101>,
linearColumnFilter_caller<24, T, D, BrdColReflect101>,
linearColumnFilter_caller<25, T, D, BrdColReflect101>,
linearColumnFilter_caller<26, T, D, BrdColReflect101>,
linearColumnFilter_caller<27, T, D, BrdColReflect101>,
linearColumnFilter_caller<28, T, D, BrdColReflect101>,
linearColumnFilter_caller<29, T, D, BrdColReflect101>,
linearColumnFilter_caller<30, T, D, BrdColReflect101>,
linearColumnFilter_caller<31, T, D, BrdColReflect101>,
linearColumnFilter_caller<32, T, D, BrdColReflect101>
},
{
0,
linearColumnFilter_caller< 1, T, D, BrdColReplicate>,
linearColumnFilter_caller< 2, T, D, BrdColReplicate>,
linearColumnFilter_caller< 3, T, D, BrdColReplicate>,
linearColumnFilter_caller< 4, T, D, BrdColReplicate>,
linearColumnFilter_caller< 5, T, D, BrdColReplicate>,
linearColumnFilter_caller< 6, T, D, BrdColReplicate>,
linearColumnFilter_caller< 7, T, D, BrdColReplicate>,
linearColumnFilter_caller< 8, T, D, BrdColReplicate>,
linearColumnFilter_caller< 9, T, D, BrdColReplicate>,
linearColumnFilter_caller<10, T, D, BrdColReplicate>,
linearColumnFilter_caller<11, T, D, BrdColReplicate>,
linearColumnFilter_caller<12, T, D, BrdColReplicate>,
linearColumnFilter_caller<13, T, D, BrdColReplicate>,
linearColumnFilter_caller<14, T, D, BrdColReplicate>,
linearColumnFilter_caller<15, T, D, BrdColReplicate>,
linearColumnFilter_caller<16, T, D, BrdColReplicate>,
linearColumnFilter_caller<17, T, D, BrdColReplicate>,
linearColumnFilter_caller<18, T, D, BrdColReplicate>,
linearColumnFilter_caller<19, T, D, BrdColReplicate>,
linearColumnFilter_caller<20, T, D, BrdColReplicate>,
linearColumnFilter_caller<21, T, D, BrdColReplicate>,
linearColumnFilter_caller<22, T, D, BrdColReplicate>,
linearColumnFilter_caller<23, T, D, BrdColReplicate>,
linearColumnFilter_caller<24, T, D, BrdColReplicate>,
linearColumnFilter_caller<25, T, D, BrdColReplicate>,
linearColumnFilter_caller<26, T, D, BrdColReplicate>,
linearColumnFilter_caller<27, T, D, BrdColReplicate>,
linearColumnFilter_caller<28, T, D, BrdColReplicate>,
linearColumnFilter_caller<29, T, D, BrdColReplicate>,
linearColumnFilter_caller<30, T, D, BrdColReplicate>,
linearColumnFilter_caller<31, T, D, BrdColReplicate>,
linearColumnFilter_caller<32, T, D, BrdColReplicate>
},
{
0,
linearColumnFilter_caller< 1, T, D, BrdColConstant>,
linearColumnFilter_caller< 2, T, D, BrdColConstant>,
linearColumnFilter_caller< 3, T, D, BrdColConstant>,
linearColumnFilter_caller< 4, T, D, BrdColConstant>,
linearColumnFilter_caller< 5, T, D, BrdColConstant>,
linearColumnFilter_caller< 6, T, D, BrdColConstant>,
linearColumnFilter_caller< 7, T, D, BrdColConstant>,
linearColumnFilter_caller< 8, T, D, BrdColConstant>,
linearColumnFilter_caller< 9, T, D, BrdColConstant>,
linearColumnFilter_caller<10, T, D, BrdColConstant>,
linearColumnFilter_caller<11, T, D, BrdColConstant>,
linearColumnFilter_caller<12, T, D, BrdColConstant>,
linearColumnFilter_caller<13, T, D, BrdColConstant>,
linearColumnFilter_caller<14, T, D, BrdColConstant>,
linearColumnFilter_caller<15, T, D, BrdColConstant>,
linearColumnFilter_caller<16, T, D, BrdColConstant>,
linearColumnFilter_caller<17, T, D, BrdColConstant>,
linearColumnFilter_caller<18, T, D, BrdColConstant>,
linearColumnFilter_caller<19, T, D, BrdColConstant>,
linearColumnFilter_caller<20, T, D, BrdColConstant>,
linearColumnFilter_caller<21, T, D, BrdColConstant>,
linearColumnFilter_caller<22, T, D, BrdColConstant>,
linearColumnFilter_caller<23, T, D, BrdColConstant>,
linearColumnFilter_caller<24, T, D, BrdColConstant>,
linearColumnFilter_caller<25, T, D, BrdColConstant>,
linearColumnFilter_caller<26, T, D, BrdColConstant>,
linearColumnFilter_caller<27, T, D, BrdColConstant>,
linearColumnFilter_caller<28, T, D, BrdColConstant>,
linearColumnFilter_caller<29, T, D, BrdColConstant>,
linearColumnFilter_caller<30, T, D, BrdColConstant>,
linearColumnFilter_caller<31, T, D, BrdColConstant>,
linearColumnFilter_caller<32, T, D, BrdColConstant>
},
{
0,
linearColumnFilter_caller< 1, T, D, BrdColReflect>,
linearColumnFilter_caller< 2, T, D, BrdColReflect>,
linearColumnFilter_caller< 3, T, D, BrdColReflect>,
linearColumnFilter_caller< 4, T, D, BrdColReflect>,
linearColumnFilter_caller< 5, T, D, BrdColReflect>,
linearColumnFilter_caller< 6, T, D, BrdColReflect>,
linearColumnFilter_caller< 7, T, D, BrdColReflect>,
linearColumnFilter_caller< 8, T, D, BrdColReflect>,
linearColumnFilter_caller< 9, T, D, BrdColReflect>,
linearColumnFilter_caller<10, T, D, BrdColReflect>,
linearColumnFilter_caller<11, T, D, BrdColReflect>,
linearColumnFilter_caller<12, T, D, BrdColReflect>,
linearColumnFilter_caller<13, T, D, BrdColReflect>,
linearColumnFilter_caller<14, T, D, BrdColReflect>,
linearColumnFilter_caller<15, T, D, BrdColReflect>,
linearColumnFilter_caller<16, T, D, BrdColReflect>,
linearColumnFilter_caller<17, T, D, BrdColReflect>,
linearColumnFilter_caller<18, T, D, BrdColReflect>,
linearColumnFilter_caller<19, T, D, BrdColReflect>,
linearColumnFilter_caller<20, T, D, BrdColReflect>,
linearColumnFilter_caller<21, T, D, BrdColReflect>,
linearColumnFilter_caller<22, T, D, BrdColReflect>,
linearColumnFilter_caller<23, T, D, BrdColReflect>,
linearColumnFilter_caller<24, T, D, BrdColReflect>,
linearColumnFilter_caller<25, T, D, BrdColReflect>,
linearColumnFilter_caller<26, T, D, BrdColReflect>,
linearColumnFilter_caller<27, T, D, BrdColReflect>,
linearColumnFilter_caller<28, T, D, BrdColReflect>,
linearColumnFilter_caller<29, T, D, BrdColReflect>,
linearColumnFilter_caller<30, T, D, BrdColReflect>,
linearColumnFilter_caller<31, T, D, BrdColReflect>,
linearColumnFilter_caller<32, T, D, BrdColReflect>
},
{
0,
linearColumnFilter_caller< 1, T, D, BrdColWrap>,
linearColumnFilter_caller< 2, T, D, BrdColWrap>,
linearColumnFilter_caller< 3, T, D, BrdColWrap>,
linearColumnFilter_caller< 4, T, D, BrdColWrap>,
linearColumnFilter_caller< 5, T, D, BrdColWrap>,
linearColumnFilter_caller< 6, T, D, BrdColWrap>,
linearColumnFilter_caller< 7, T, D, BrdColWrap>,
linearColumnFilter_caller< 8, T, D, BrdColWrap>,
linearColumnFilter_caller< 9, T, D, BrdColWrap>,
linearColumnFilter_caller<10, T, D, BrdColWrap>,
linearColumnFilter_caller<11, T, D, BrdColWrap>,
linearColumnFilter_caller<12, T, D, BrdColWrap>,
linearColumnFilter_caller<13, T, D, BrdColWrap>,
linearColumnFilter_caller<14, T, D, BrdColWrap>,
linearColumnFilter_caller<15, T, D, BrdColWrap>,
linearColumnFilter_caller<16, T, D, BrdColWrap>,
linearColumnFilter_caller<17, T, D, BrdColWrap>,
linearColumnFilter_caller<18, T, D, BrdColWrap>,
linearColumnFilter_caller<19, T, D, BrdColWrap>,
linearColumnFilter_caller<20, T, D, BrdColWrap>,
linearColumnFilter_caller<21, T, D, BrdColWrap>,
linearColumnFilter_caller<22, T, D, BrdColWrap>,
linearColumnFilter_caller<23, T, D, BrdColWrap>,
linearColumnFilter_caller<24, T, D, BrdColWrap>,
linearColumnFilter_caller<25, T, D, BrdColWrap>,
linearColumnFilter_caller<26, T, D, BrdColWrap>,
linearColumnFilter_caller<27, T, D, BrdColWrap>,
linearColumnFilter_caller<28, T, D, BrdColWrap>,
linearColumnFilter_caller<29, T, D, BrdColWrap>,
linearColumnFilter_caller<30, T, D, BrdColWrap>,
linearColumnFilter_caller<31, T, D, BrdColWrap>,
linearColumnFilter_caller<32, T, D, BrdColWrap>
}
};
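// callers is indexed as callers[brd_type][ksize]: the first index selects the border policy
// (0 reflect101, 1 replicate, 2 constant, 3 reflect, 4 wrap, matching the rows above) and the
// second is the kernel size, 1..32, with entry 0 left as a null placeholder.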
loadKernel(kernel, ksize, stream);
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
}
template void linearColumnFilter_gpu<float , uchar >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float , int >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
} // namespace column_filter
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */

View File

@@ -1,131 +1,131 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, PtrStepSz<T> dst, int top, int left)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = src(y - top, x - left);
}
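// The Ptr2D passed in as src is a BorderReader (see the dispatcher below), so the shifted
// coordinates (y - top, x - left) may be negative or beyond the source image; the selected
// border policy remaps them, which is what fills the new border pixels.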
template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
{
static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, int top, int left,
const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));
BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);
copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode,
const T* borderValue, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type vec_type;
typedef void (*caller_t)(const PtrStepSz<vec_type>& src, const PtrStepSz<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
static const caller_t callers[5] =
{
CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call,
CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
CopyMakeBorderDispatcher<BrdWrap, vec_type>::call
};
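// borderMode indexes this table directly: 0 reflect101, 1 replicate, 2 constant, 3 reflect,
// 4 wrap, in the order listed above.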
callers[borderMode](PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
}
template void copyMakeBorder_gpu<uchar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<ushort, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<short, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<float, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, PtrStepSz<T> dst, int top, int left)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = src(y - top, x - left);
}
template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
{
static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, int top, int left,
const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));
BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);
copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode,
const T* borderValue, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type vec_type;
typedef void (*caller_t)(const PtrStepSz<vec_type>& src, const PtrStepSz<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
static const caller_t callers[5] =
{
CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call,
CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
CopyMakeBorderDispatcher<BrdWrap, vec_type>::call
};
callers[borderMode](PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
}
template void copyMakeBorder_gpu<uchar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<ushort, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<short, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<float, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
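The dispatcher above indexes its callers[] table directly with borderMode, so the value passed from the host must follow that order: 0 reflect101, 1 replicate, 2 constant, 3 reflect, 4 wrap. A minimal host-side sketch of how the uchar, 3-channel instantiation might be driven follows; the helper name makeBorder8uC3 and the forward declaration are assumptions made for illustration, and d_dst is presumed pre-allocated at (rows + top + bottom) x (cols + left + right).

// Host-side sketch (an assumption, not part of this diff).
#include <cuda_runtime.h>
#include "opencv2/gpu/devmem2d.hpp"   // PtrStepSzb

namespace cv { namespace gpu { namespace device { namespace imgproc
{
    // Mirrors the signature of the explicit instantiations shown above.
    template <typename T, int cn>
    void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst,
                            int top, int left, int borderMode,
                            const T* borderValue, cudaStream_t stream);
}}}}

void makeBorder8uC3(const cv::gpu::PtrStepSzb& d_src, const cv::gpu::PtrStepSzb& d_dst,
                    int top, int left, cudaStream_t stream)
{
    // Index into callers[]: 0 reflect101, 1 replicate, 2 constant, 3 reflect, 4 wrap
    // (same order as the BORDER_*_GPU enum in internal_shared.hpp).
    const int borderMode = 2;                        // constant border
    const unsigned char borderValue[3] = {0, 0, 0};  // black fill, uchar == unsigned char

    cv::gpu::device::imgproc::copyMakeBorder_gpu<unsigned char, 3>(
        d_src, d_dst, top, left, borderMode, borderValue, stream);
}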

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@@ -1,151 +1,151 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
//
// The original code was written by Paul Furgale and Chi Hay Tong
// and later optimized and prepared for integration into OpenCV by Itseez.
//
//M*/
#if !defined CUDA_DISABLER
#include <thrust/sort.h>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
namespace cv { namespace gpu { namespace device
{
namespace gfft
{
texture<float, cudaTextureType2D, cudaReadModeElementType> eigTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__device__ uint g_counter = 0;
template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, uint max_count, int rows, int cols)
{
#if __CUDA_ARCH__ >= 110
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j))
{
float val = tex2D(eigTex, j, i);
if (val > threshold)
{
float maxVal = val;
maxVal = ::fmax(tex2D(eigTex, j - 1, i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j , i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j - 1, i), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i), maxVal);
maxVal = ::fmax(tex2D(eigTex, j - 1, i + 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j , i + 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i + 1), maxVal);
if (val == maxVal)
{
const uint ind = atomicInc(&g_counter, (uint)(-1));
if (ind < max_count)
corners[ind] = make_float2(j, i);
}
}
}
#endif // __CUDA_ARCH__ >= 110
}
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(uint)) );
bindTexture(&eigTex, eig);
dim3 block(16, 16);
dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));
if (mask.data)
findCorners<<<grid, block>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
else
findCorners<<<grid, block>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
uint count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(uint), cudaMemcpyDeviceToHost) );
return min(count, max_count);
}
class EigGreater
{
public:
__device__ __forceinline__ bool operator()(float2 a, float2 b) const
{
return tex2D(eigTex, a.x, a.y) > tex2D(eigTex, b.x, b.y);
}
};
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count)
{
bindTexture(&eigTex, eig);
thrust::device_ptr<float2> ptr(corners);
thrust::sort(ptr, ptr + count, EigGreater());
}
} // namespace optical_flow
}}}
#endif /* CUDA_DISABLER */
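A host-side sketch of how the two entry points above are typically chained: detect thresholded 3x3 local maxima of the eigenvalue image, then rank them by response. detectAndRankCorners and the forward declarations are illustrative assumptions; only the findCorners_gpu / sortCorners_gpu signatures come from the code above.

// Host-side sketch (an assumption, not part of this diff).
#include <cuda_runtime.h>
#include "opencv2/gpu/devmem2d.hpp"   // PtrStepSzf, PtrStepSzb

namespace cv { namespace gpu { namespace device { namespace gfft
{
    int  findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask,
                         float2* corners, int max_count);
    void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count);
}}}}

int detectAndRankCorners(cv::gpu::PtrStepSzf d_eig, cv::gpu::PtrStepSzb d_mask,
                         float2* d_corners, int maxCorners, float threshold)
{
    // Keep candidates that exceed the threshold and are 3x3 local maxima;
    // the returned count is already clamped to maxCorners. d_mask.data may be 0
    // to disable masking, matching the dispatch in findCorners_gpu.
    int found = cv::gpu::device::gfft::findCorners_gpu(d_eig, threshold, d_mask,
                                                       d_corners, maxCorners);

    // Order the survivors by decreasing corner response (Thrust sort on device).
    if (found > 0)
        cv::gpu::device::gfft::sortCorners_gpu(d_eig, d_corners, found);

    return found;
}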

View File

@@ -1,224 +1,224 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
namespace cv { namespace gpu { namespace device
{
#define UINT_BITS 32U
//Warps == subhistograms per threadblock
#define WARP_COUNT 6
//Threadblock size
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
#define HISTOGRAM256_BIN_COUNT 256
//Shared memory per threadblock
#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
#define PARTIAL_HISTOGRAM256_COUNT 240
#define MERGE_THREADBLOCK_SIZE 256
#define USE_SMEM_ATOMICS (defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120))
namespace hist
{
#if (!USE_SMEM_ATOMICS)
#define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
__forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
{
uint count;
do
{
count = s_WarpHist[data] & TAG_MASK;
count = threadTag | (count + 1);
s_WarpHist[data] = count;
} while (s_WarpHist[data] != count);
}
#else
#define TAG_MASK 0xFFFFFFFFU
__forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)
{
atomicAdd(s_WarpHist + data, 1);
}
#endif
__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
{
uint x = pos_x << 2;
if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);
if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);
if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
}
__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
{
//Per-warp subhistogram storage
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
//Clear shared memory storage for current threadblock before processing
#pragma unroll
for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
//Cycle through the entire data set, update subhistograms for each warp
const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
__syncthreads();
const uint colsui = d_Data.step / sizeof(uint);
for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
{
uint pos_y = pos / colsui;
uint pos_x = pos % colsui;
uint data = d_Data.ptr(pos_y)[pos_x];
addWord(s_WarpHist, data, tag, pos_x, cols);
}
//Merge per-warp histograms into per-block and write to global memory
__syncthreads();
for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
{
uint sum = 0;
for (uint i = 0; i < WARP_COUNT; i++)
sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
}
}
////////////////////////////////////////////////////////////////////////////////
// Merge histogram256() output
// Run one threadblock per bin; each threadblock adds up the same bin counter
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
// takes only a fraction of total processing time
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
{
uint sum = 0;
#pragma unroll
for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
__shared__ uint data[MERGE_THREADBLOCK_SIZE];
data[threadIdx.x] = sum;
for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
{
__syncthreads();
if(threadIdx.x < stride)
data[threadIdx.x] += data[threadIdx.x + stride];
}
if(threadIdx.x == 0)
d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
}
void histogram256_gpu(PtrStepSzb src, int* hist, uint* buf, cudaStream_t stream)
{
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
PtrStepSz<uint>(src),
buf,
static_cast<uint>(src.rows * src.step / sizeof(uint)),
src.cols);
cudaSafeCall( cudaGetLastError() );
mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__constant__ int c_lut[256];
__global__ void equalizeHist(const PtrStepSzb src, PtrStepb dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < src.cols && y < src.rows)
{
const uchar val = src.ptr(y)[x];
const int lut = c_lut[val];
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
}
}
void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
equalizeHist<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
} // namespace hist
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
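The partial-histogram kernel launches PARTIAL_HISTOGRAM256_COUNT (240) blocks and each writes a full 256-bin histogram, so the intermediate buf passed to histogram256_gpu must hold 240 * 256 unsigned ints. A hedged host-side sketch follows; computeHist256 is a hypothetical helper and the explicit cudaMalloc/cudaFree are only for illustration.

// Host-side sketch (an assumption, not part of this diff).
#include <cuda_runtime.h>
#include "opencv2/gpu/devmem2d.hpp"   // PtrStepSzb

namespace cv { namespace gpu { namespace device { namespace hist
{
    void histogram256_gpu(PtrStepSzb src, int* hist, unsigned int* buf, cudaStream_t stream);
}}}}

void computeHist256(cv::gpu::PtrStepSzb d_src, int* d_hist, cudaStream_t stream)
{
    const int partialCount = 240;   // PARTIAL_HISTOGRAM256_COUNT
    const int binCount     = 256;   // HISTOGRAM256_BIN_COUNT

    unsigned int* d_buf = 0;
    cudaMalloc((void**)&d_buf, partialCount * binCount * sizeof(unsigned int));

    cv::gpu::device::hist::histogram256_gpu(d_src, d_hist, d_buf, stream);

    // histogram256_gpu synchronizes only for the default stream, so wait before
    // releasing the scratch buffer when an asynchronous stream is used.
    if (stream != 0)
        cudaStreamSynchronize(stream);
    cudaFree(d_buf);
}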

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,100 +1,100 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_internal_shared_HPP__
#define __OPENCV_internal_shared_HPP__
#include <cuda_runtime.h>
#include <npp.h>
#include "NPP_staging.hpp"
#include "opencv2/gpu/devmem2d.hpp"
#include "safe_call.hpp"
#include "opencv2/gpu/device/common.hpp"
namespace cv { namespace gpu
{
enum
{
BORDER_REFLECT101_GPU = 0,
BORDER_REPLICATE_GPU,
BORDER_CONSTANT_GPU,
BORDER_REFLECT_GPU,
BORDER_WRAP_GPU
};
class NppStreamHandler
{
public:
inline explicit NppStreamHandler(cudaStream_t newStream = 0)
{
oldStream = nppGetStream();
nppSetStream(newStream);
}
inline ~NppStreamHandler()
{
nppSetStream(oldStream);
}
private:
cudaStream_t oldStream;
};
class NppStStreamHandler
{
public:
inline explicit NppStStreamHandler(cudaStream_t newStream = 0)
{
oldStream = nppStSetActiveCUDAstream(newStream);
}
inline ~NppStStreamHandler()
{
nppStSetActiveCUDAstream(oldStream);
}
private:
cudaStream_t oldStream;
};
}}
#endif /* __OPENCV_internal_shared_HPP__ */
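The two handlers above are small RAII guards: the constructor switches the active NPP (or NPP_staging) stream and the destructor restores the previous one, so any NPP primitive called inside the scope runs on the caller's stream. A usage sketch under that assumption; boxFilterOnStream is a hypothetical helper and the box-filter call merely stands in for whatever NPP primitive the module invokes.

// Usage sketch (an assumption, not part of this diff).
#include <npp.h>
#include <cuda_runtime.h>
#include "internal_shared.hpp"   // cv::gpu::NppStreamHandler

void boxFilterOnStream(const Npp8u* d_src, int srcStep,
                       Npp8u* d_dst, int dstStep,
                       NppiSize roi, cudaStream_t stream)
{
    cv::gpu::NppStreamHandler h(stream);   // nppSetStream(stream) for this scope

    NppiSize  maskSize = {3, 3};
    NppiPoint anchor   = {1, 1};
    nppiFilterBox_8u_C1R(d_src, srcStep, d_dst, dstStep, roi, maskSize, anchor);
}   // destructor restores the previously active NPP stream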

File diff suppressed because it is too large Load Diff

View File

@@ -1,217 +1,217 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace device
{
namespace mathfunc
{
//////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar
struct Nothing
{
static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
{
}
};
struct Magnitude
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
}
};
struct MagnitudeSqr
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
}
};
struct Atan2
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
{
float angle = ::atan2f(y_data, x_data);
angle += (angle < 0) * 2.0 * CV_PI;
dst[y * dst_step + x] = scale * angle;
}
};
template <typename Mag, typename Angle>
__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float x_data = xptr[y * x_step + x];
float y_data = yptr[y * y_step + x];
Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
}
}
struct NonEmptyMag
{
static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
{
return mag[y * mag_step + x];
}
};
struct EmptyMag
{
static __device__ __forceinline__ float get(const float*, size_t, int, int)
{
return 1.0f;
}
};
template <typename Mag>
__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float mag_data = Mag::get(mag, mag_step, x, y);
float angle_data = angle[y * angle_step + x];
float sin_a, cos_a;
::sincosf(scale * angle_data, &sin_a, &cos_a);
xptr[y * x_step + x] = mag_data * cos_a;
yptr[y * y_step + x] = mag_data * sin_a;
}
}
template <typename Mag, typename Angle>
void cartToPolar_caller(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(x.cols, threads.x);
grid.y = divUp(x.rows, threads.y);
const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2][2][2] =
{
{
{
cartToPolar_caller<Magnitude, Atan2>,
cartToPolar_caller<Magnitude, Nothing>
},
{
cartToPolar_caller<MagnitudeSqr, Atan2>,
cartToPolar_caller<MagnitudeSqr, Nothing>,
}
},
{
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>
},
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>,
}
}
};
callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
}
template <typename Mag>
void polarToCart_caller(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(mag.cols, threads.x);
grid.y = divUp(mag.rows, threads.y);
const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;
polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2] =
{
polarToCart_caller<NonEmptyMag>,
polarToCart_caller<EmptyMag>
};
callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
}
} // namespace mathfunc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
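For reference, the per-pixel mapping the kernels above implement: magnitude is sqrt(x^2 + y^2) (or x^2 + y^2 for MagnitudeSqr), the angle is atan2(y, x) wrapped into [0, 2*pi) and scaled by 180/pi when degrees are requested, and polarToCart inverts this with sincos of the rescaled angle. A plain CPU sketch of the same mapping, added for clarity; the helper names are illustrative only.

// CPU reference sketch (an assumption, not part of this diff).
#include <cmath>

void cartToPolarRef(float x, float y, bool angleInDegrees,
                    float& mag, float& angle)
{
    const float pi = 3.14159265358979323846f;
    mag   = std::sqrt(x * x + y * y);
    angle = std::atan2(y, x);
    if (angle < 0)
        angle += 2.0f * pi;              // wrap to [0, 2*pi), as in the Atan2 functor
    if (angleInDegrees)
        angle *= 180.0f / pi;            // same scale factor as cartToPolar_caller
}

void polarToCartRef(float mag, float angle, bool angleInDegrees,
                    float& x, float& y)
{
    const float pi = 3.14159265358979323846f;
    if (angleInDegrees)
        angle *= pi / 180.0f;            // same scale factor as polarToCart_caller
    x = mag * std::cos(angle);
    y = mag * std::sin(angle);
}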

File diff suppressed because it is too large Load Diff

View File

@@ -74,12 +74,12 @@ namespace cv { namespace gpu { namespace device
const int i = blockDim.y * blockIdx.y + threadIdx.y;
const int j = blockDim.x * blockIdx.x + threadIdx.x;
if (j >= dst.cols || i >= dst.rows)
return;
int bsize = search_radius + block_radius;
int search_window = 2 * search_radius + 1;
float minus_search_window2_inv = -1.f/(search_window * search_window);
value_type sum1 = VecTraits<value_type>::all(0);
@@ -87,16 +87,16 @@ namespace cv { namespace gpu { namespace device
if (j - bsize >= 0 && j + bsize < dst.cols && i - bsize >= 0 && i + bsize < dst.rows)
{
for(float y = -search_radius; y <= search_radius; ++y)
for(float x = -search_radius; x <= search_radius; ++x)
{
float dist2 = 0;
for(float ty = -block_radius; ty <= block_radius; ++ty)
for(float tx = -block_radius; tx <= block_radius; ++tx)
{
value_type bv = saturate_cast<value_type>(src(i + y + ty, j + x + tx));
value_type av = saturate_cast<value_type>(src(i + ty, j + tx));
dist2 += norm2(av - bv);
}
@@ -119,10 +119,10 @@ namespace cv { namespace gpu { namespace device
for(float tx = -block_radius; tx <= block_radius; ++tx)
{
value_type bv = saturate_cast<value_type>(b.at(i + y + ty, j + x + tx, src));
value_type av = saturate_cast<value_type>(b.at(i + ty, j + tx, src));
dist2 += norm2(av - bv);
}
float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv);
sum1 = sum1 + w * saturate_cast<value_type>(b.at(i + y, j + x, src));
@@ -144,9 +144,9 @@ namespace cv { namespace gpu { namespace device
B<T> b(src.rows, src.cols);
int block_window = 2 * block_radius + 1;
float minus_h2_inv = -1.f/(h * h * VecTraits<T>::cn);
float noise_mult = minus_h2_inv/(block_window * block_window);
cudaSafeCall( cudaFuncSetCacheConfig (nlm_kernel<T, B<T> >, cudaFuncCachePreferL1) );
nlm_kernel<<<grid, block>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, search_radius, block_radius, noise_mult);
cudaSafeCall ( cudaGetLastError () );
@@ -160,7 +160,7 @@ namespace cv { namespace gpu { namespace device
{
typedef void (*func_t)(const PtrStepSzb src, PtrStepSzb dst, int search_radius, int block_radius, float h, cudaStream_t stream);
static func_t funcs[] =
{
nlm_caller<T, BrdReflect101>,
nlm_caller<T, BrdReplicate>,
@@ -183,7 +183,7 @@ namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
__device__ __forceinline__ int calcDist(const uchar& a, const uchar& b) { return (a-b)*(a-b); }
__device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); }
__device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); }
@@ -193,7 +193,7 @@ namespace cv { namespace gpu { namespace device
enum
{
CTA_SIZE = 128,
TILE_COLS = 128,
TILE_ROWS = 32,
@@ -217,7 +217,7 @@ namespace cv { namespace gpu { namespace device
PtrStep<T> src;
mutable PtrStepi buffer;
__device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
@@ -256,7 +256,7 @@ namespace cv { namespace gpu { namespace device
int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx));
dist_sums[index] += dist;
col_sums(tx + block_radius, index) += dist;
}
#endif
@@ -265,9 +265,9 @@ namespace cv { namespace gpu { namespace device
}
__device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
@@ -280,8 +280,8 @@ namespace cv { namespace gpu { namespace device
int col_sum = 0;
for (int ty = -block_radius; ty <= block_radius; ++ty)
col_sum += calcDist(src(ay + ty, ax), src(by + ty, bx));
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
@@ -298,7 +298,7 @@ namespace cv { namespace gpu { namespace device
T a_down = src(ay + block_radius, ax);
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
@@ -309,9 +309,9 @@ namespace cv { namespace gpu { namespace device
T b_down = src(by + block_radius, bx);
int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up);
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
up_col_sums(j, index) = col_sum;
}
}
@@ -339,14 +339,14 @@ namespace cv { namespace gpu { namespace device
sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x));
}
volatile __shared__ float cta_buffer[CTA_SIZE];
int tid = threadIdx.x;
cta_buffer[tid] = weights_sum;
__syncthreads();
Block::reduce<CTA_SIZE>(cta_buffer, plus());
weights_sum = cta_buffer[0];
__syncthreads();
@@ -356,7 +356,7 @@ namespace cv { namespace gpu { namespace device
{
cta_buffer[tid] = reinterpret_cast<float*>(&sum)[n];
__syncthreads();
Block::reduce<CTA_SIZE>(cta_buffer, plus());
reinterpret_cast<float*>(&sum)[n] = cta_buffer[0];
__syncthreads();
@@ -388,7 +388,7 @@ namespace cv { namespace gpu { namespace device
for (int i = tby; i < tey; ++i)
for (int j = tbx; j < tex; ++j)
{
__syncthreads();
if (j == tbx)
@@ -397,7 +397,7 @@ namespace cv { namespace gpu { namespace device
first = 0;
}
else
{
if (i == tby)
shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums);
else
@@ -407,7 +407,7 @@ namespace cv { namespace gpu { namespace device
}
__syncthreads();
convolve_window(i, j, dist_sums, col_sums, up_col_sums, dst(i, j));
}
}
@@ -418,7 +418,7 @@ namespace cv { namespace gpu { namespace device
__global__ void fast_nlm_kernel(const FastNonLocalMenas<T> fnlm, PtrStepSz<T> dst) { fnlm(dst); }
void nln_fast_get_buffer_size(const PtrStepSzb& src, int search_window, int block_window, int& buffer_cols, int& buffer_rows)
{
typedef FastNonLocalMenas<uchar> FNLM;
dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS));
@@ -434,13 +434,13 @@ namespace cv { namespace gpu { namespace device
FNLM fnlm(search_window, block_window, h);
fnlm.src = (PtrStepSz<T>)src;
fnlm.buffer = buffer;
dim3 block(FNLM::CTA_SIZE, 1);
dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS));
int smem = search_window * search_window * sizeof(int);
fast_nlm_kernel<<<grid, block, smem>>>(fnlm, (PtrStepSz<T>)dst);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
@@ -451,7 +451,7 @@ namespace cv { namespace gpu { namespace device
template void nlm_fast_gpu<uchar2>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
template void nlm_fast_gpu<uchar3>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
__global__ void fnlm_split_kernel(const PtrStepSz<uchar3> lab, PtrStepb l, PtrStep<uchar2> ab)
{
@@ -462,10 +462,10 @@ namespace cv { namespace gpu { namespace device
{
uchar3 p = lab(y, x);
ab(y,x) = make_uchar2(p.y, p.z);
l(y,x) = p.x;
}
}
void fnlm_split_channels(const PtrStepSz<uchar3>& lab, PtrStepb l, PtrStep<uchar2> ab, cudaStream_t stream)
{
dim3 b(32, 8);
@@ -485,7 +485,7 @@ namespace cv { namespace gpu { namespace device
if (x < lab.cols && y < lab.rows)
{
uchar2 p = ab(y, x);
lab(y, x) = make_uchar3(l(y, x), p.x, p.y);
}
}
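For reference, a minimal host-side sketch of the channel handling performed by fnlm_split_kernel and the merge kernel above: the packed Lab image is split into an L plane and an interleaved ab plane, then merged back the same way. Plain C++ with hypothetical row-major buffers; not part of this commit.

#include <cstddef>
#include <cstdint>
#include <vector>

// CPU reference for the split/merge pair above: lab holds rows*cols*3 bytes,
// l holds rows*cols bytes, ab holds rows*cols*2 bytes (assumed row-major layout).
inline void fnlm_split_cpu(const std::vector<uint8_t>& lab, size_t rows, size_t cols,
                           std::vector<uint8_t>& l, std::vector<uint8_t>& ab)
{
    l.resize(rows * cols);
    ab.resize(rows * cols * 2);
    for (size_t i = 0; i < rows * cols; ++i)
    {
        l[i]          = lab[3 * i + 0];   // L plane
        ab[2 * i + 0] = lab[3 * i + 1];   // a
        ab[2 * i + 1] = lab[3 * i + 2];   // b
    }
}

inline void fnlm_merge_cpu(const std::vector<uint8_t>& l, const std::vector<uint8_t>& ab,
                           size_t rows, size_t cols, std::vector<uint8_t>& lab)
{
    lab.resize(rows * cols * 3);
    for (size_t i = 0; i < rows * cols; ++i)
    {
        lab[3 * i + 0] = l[i];
        lab[3 * i + 1] = ab[2 * i + 0];
        lab[3 * i + 2] = ab[2 * i + 1];
    }
}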


@@ -1,220 +1,220 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
namespace cv { namespace gpu { namespace device
{
namespace optical_flow
{
#define NEEDLE_MAP_SCALE 16
#define NUM_VERTS_PER_ARROW 6
__global__ void NeedleMapAverageKernel(const PtrStepSzf u, const PtrStepf v, PtrStepf u_avg, PtrStepf v_avg)
{
__shared__ float smem[2 * NEEDLE_MAP_SCALE];
volatile float* u_col_sum = smem;
volatile float* v_col_sum = u_col_sum + NEEDLE_MAP_SCALE;
const int x = blockIdx.x * NEEDLE_MAP_SCALE + threadIdx.x;
const int y = blockIdx.y * NEEDLE_MAP_SCALE;
u_col_sum[threadIdx.x] = 0;
v_col_sum[threadIdx.x] = 0;
#pragma unroll
for(int i = 0; i < NEEDLE_MAP_SCALE; ++i)
{
u_col_sum[threadIdx.x] += u(::min(y + i, u.rows - 1), x);
v_col_sum[threadIdx.x] += v(::min(y + i, u.rows - 1), x);
}
if (threadIdx.x < 8)
{
// now add the column sums (tree reduction over the 16 columns; the block is a single warp)
const uint X = threadIdx.x;
u_col_sum[X] += u_col_sum[X + 8];
v_col_sum[X] += v_col_sum[X + 8];
if (X < 4)
{
u_col_sum[X] += u_col_sum[X + 4];
v_col_sum[X] += v_col_sum[X + 4];
}
if (X < 2)
{
u_col_sum[X] += u_col_sum[X + 2];
v_col_sum[X] += v_col_sum[X + 2];
}
if (X == 0)
{
u_col_sum[X] += u_col_sum[X + 1];
v_col_sum[X] += v_col_sum[X + 1];
}
}
if (threadIdx.x == 0)
{
const float coeff = 1.0f / (NEEDLE_MAP_SCALE * NEEDLE_MAP_SCALE);
u_col_sum[0] *= coeff;
v_col_sum[0] *= coeff;
u_avg(blockIdx.y, blockIdx.x) = u_col_sum[0];
v_avg(blockIdx.y, blockIdx.x) = v_col_sum[0];
}
}
void NeedleMapAverage_gpu(PtrStepSzf u, PtrStepSzf v, PtrStepSzf u_avg, PtrStepSzf v_avg)
{
const dim3 block(NEEDLE_MAP_SCALE);
const dim3 grid(u_avg.cols, u_avg.rows);
NeedleMapAverageKernel<<<grid, block>>>(u, v, u_avg, v_avg);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void NeedleMapVertexKernel(const PtrStepSzf u_avg, const PtrStepf v_avg, float* vertex_data, float* color_data, float max_flow, float xscale, float yscale)
{
// test - just draw a triangle at each pixel
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const float arrow_x = x * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
const float arrow_y = y * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
float3 v[NUM_VERTS_PER_ARROW];
if (x < u_avg.cols && y < u_avg.rows)
{
const float u_avg_val = u_avg(y, x);
const float v_avg_val = v_avg(y, x);
const float theta = ::atan2f(v_avg_val, u_avg_val);// + CV_PI;
float r = ::sqrtf(v_avg_val * v_avg_val + u_avg_val * u_avg_val);
r = fmin(14.0f * (r / max_flow), 14.0f);
v[0].z = 1.0f;
v[1].z = 0.7f;
v[2].z = 0.7f;
v[3].z = 0.7f;
v[4].z = 0.7f;
v[5].z = 1.0f;
v[0].x = arrow_x;
v[0].y = arrow_y;
v[5].x = arrow_x;
v[5].y = arrow_y;
v[2].x = arrow_x + r * ::cosf(theta);
v[2].y = arrow_y + r * ::sinf(theta);
v[3].x = v[2].x;
v[3].y = v[2].y;
r = ::fmin(r, 2.5f);
v[1].x = arrow_x + r * ::cosf(theta - CV_PI / 2.0f);
v[1].y = arrow_y + r * ::sinf(theta - CV_PI / 2.0f);
v[4].x = arrow_x + r * ::cosf(theta + CV_PI / 2.0f);
v[4].y = arrow_y + r * ::sinf(theta + CV_PI / 2.0f);
int indx = (y * u_avg.cols + x) * NUM_VERTS_PER_ARROW * 3;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[0].x * xscale;
vertex_data[indx++] = v[0].y * yscale;
vertex_data[indx++] = v[0].z;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[1].x * xscale;
vertex_data[indx++] = v[1].y * yscale;
vertex_data[indx++] = v[1].z;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[2].x * xscale;
vertex_data[indx++] = v[2].y * yscale;
vertex_data[indx++] = v[2].z;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[3].x * xscale;
vertex_data[indx++] = v[3].y * yscale;
vertex_data[indx++] = v[3].z;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[4].x * xscale;
vertex_data[indx++] = v[4].y * yscale;
vertex_data[indx++] = v[4].z;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[5].x * xscale;
vertex_data[indx++] = v[5].y * yscale;
vertex_data[indx++] = v[5].z;
}
}
void CreateOpticalFlowNeedleMap_gpu(PtrStepSzf u_avg, PtrStepSzf v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale)
{
const dim3 block(16);
const dim3 grid(divUp(u_avg.cols, block.x), divUp(u_avg.rows, block.y));
NeedleMapVertexKernel<<<grid, block>>>(u_avg, v_avg, vertex_buffer, color_data, max_flow, xscale, yscale);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif /* CUDA_DISABLER */
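For reference, a minimal CPU sketch of what NeedleMapAverageKernel computes: each NEEDLE_MAP_SCALE x NEEDLE_MAP_SCALE (16x16) tile of the flow field is reduced to one averaged (u, v) vector, with reads clamped at the bottom edge as in the kernel. Plain C++, row-major buffers assumed; not part of this commit.

#include <algorithm>
#include <vector>

// CPU reference for NeedleMapAverageKernel. u and v hold rows*cols floats,
// u_avg and v_avg receive one value per scale x scale tile (hypothetical layout).
inline void needle_map_average_cpu(const std::vector<float>& u, const std::vector<float>& v,
                                   int rows, int cols, int scale,
                                   std::vector<float>& u_avg, std::vector<float>& v_avg)
{
    const int out_rows = (rows + scale - 1) / scale;
    const int out_cols = (cols + scale - 1) / scale;
    u_avg.assign(out_rows * out_cols, 0.0f);
    v_avg.assign(out_rows * out_cols, 0.0f);
    const float coeff = 1.0f / (scale * scale);

    for (int by = 0; by < out_rows; ++by)
        for (int bx = 0; bx < out_cols; ++bx)
        {
            float su = 0.0f, sv = 0.0f;
            for (int dy = 0; dy < scale; ++dy)
                for (int dx = 0; dx < scale; ++dx)
                {
                    const int y = std::min(by * scale + dy, rows - 1); // row clamp, as in the kernel
                    const int x = std::min(bx * scale + dx, cols - 1); // column clamp added for safety
                    su += u[y * cols + x];
                    sv += v[y * cols + x];
                }
            u_avg[by * out_cols + bx] = su * coeff;
            v_avg[by * out_cols + bx] = sv * coeff;
        }
}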


@@ -1,422 +1,422 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
//
// The original code was written by Paul Furgale and Chi Hay Tong
// and later optimized and prepared for integration into OpenCV by Itseez.
//
//M*/
#if !defined CUDA_DISABLER
#include <thrust/sort.h>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/functional.hpp"
namespace cv { namespace gpu { namespace device
{
namespace orb
{
////////////////////////////////////////////////////////////////////////////////////////////////////////
// cull
int cull_gpu(int* loc, float* response, int size, int n_points)
{
thrust::device_ptr<int> loc_ptr(loc);
thrust::device_ptr<float> response_ptr(response);
thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
return n_points;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// HarrisResponses
__global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
{
__shared__ int smem[8 * 32];
volatile int* srow = smem + threadIdx.y * blockDim.x;
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
if (ptidx < npoints)
{
const short2 loc = loc_[ptidx];
const int r = blockSize / 2;
const int x0 = loc.x - r;
const int y0 = loc.y - r;
int a = 0, b = 0, c = 0;
for (int ind = threadIdx.x; ind < blockSize * blockSize; ind += blockDim.x)
{
const int i = ind / blockSize;
const int j = ind % blockSize;
int Ix = (img(y0 + i, x0 + j + 1) - img(y0 + i, x0 + j - 1)) * 2 +
(img(y0 + i - 1, x0 + j + 1) - img(y0 + i - 1, x0 + j - 1)) +
(img(y0 + i + 1, x0 + j + 1) - img(y0 + i + 1, x0 + j - 1));
int Iy = (img(y0 + i + 1, x0 + j) - img(y0 + i - 1, x0 + j)) * 2 +
(img(y0 + i + 1, x0 + j - 1) - img(y0 + i - 1, x0 + j - 1)) +
(img(y0 + i + 1, x0 + j + 1) - img(y0 + i - 1, x0 + j + 1));
a += Ix * Ix;
b += Iy * Iy;
c += Ix * Iy;
}
reduce<32>(srow, a, threadIdx.x, plus<volatile int>());
reduce<32>(srow, b, threadIdx.x, plus<volatile int>());
reduce<32>(srow, c, threadIdx.x, plus<volatile int>());
if (threadIdx.x == 0)
{
float scale = (1 << 2) * blockSize * 255.0f;
scale = 1.0f / scale;
const float scale_sq_sq = scale * scale * scale * scale;
response[ptidx] = ((float)a * b - (float)c * c - harris_k * ((float)a + b) * ((float)a + b)) * scale_sq_sq;
}
}
}
void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(npoints, block.y);
HarrisResponses<<<grid, block, 0, stream>>>(img, loc, response, npoints, blockSize, harris_k);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
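// --- Editor's reference sketch (not part of the original file) -----------------
// CPU version of the per-keypoint score computed by HarrisResponses above:
// accumulate a = sum(Ix*Ix), b = sum(Iy*Iy), c = sum(Ix*Iy) over a
// blockSize x blockSize patch around the keypoint, then
// R = (a*b - c*c - k*(a + b)^2) * scale^4 with scale = 1 / (4 * blockSize * 255).
// Img is a hypothetical accessor with int operator()(int y, int x).
template <typename Img>
float harrisResponseReference(const Img& img, int cx, int cy, int blockSize, float harris_k)
{
    const int r = blockSize / 2;
    int a = 0, b = 0, c = 0;
    for (int i = 0; i < blockSize; ++i)
        for (int j = 0; j < blockSize; ++j)
        {
            const int y = cy - r + i;
            const int x = cx - r + j;
            const int Ix = (img(y, x + 1) - img(y, x - 1)) * 2 +
                           (img(y - 1, x + 1) - img(y - 1, x - 1)) +
                           (img(y + 1, x + 1) - img(y + 1, x - 1));
            const int Iy = (img(y + 1, x) - img(y - 1, x)) * 2 +
                           (img(y + 1, x - 1) - img(y - 1, x - 1)) +
                           (img(y + 1, x + 1) - img(y - 1, x + 1));
            a += Ix * Ix;
            b += Iy * Iy;
            c += Ix * Iy;
        }
    float scale = 1.0f / ((1 << 2) * blockSize * 255.0f);
    const float scale_sq_sq = scale * scale * scale * scale;
    return ((float)a * b - (float)c * c - harris_k * ((float)a + b) * ((float)a + b)) * scale_sq_sq;
}
// --------------------------------------------------------------------------------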
////////////////////////////////////////////////////////////////////////////////////////////////////////
// IC_Angle
__constant__ int c_u_max[32];
void loadUMax(const int* u_max, int count)
{
cudaSafeCall( cudaMemcpyToSymbol(c_u_max, u_max, count * sizeof(int)) );
}
__global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
{
__shared__ int smem[8 * 32];
volatile int* srow = smem + threadIdx.y * blockDim.x;
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
if (ptidx < npoints)
{
int m_01 = 0, m_10 = 0;
const short2 loc = loc_[ptidx];
// Treat the center line differently, v=0
for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
m_10 += u * image(loc.y, loc.x + u);
reduce<32>(srow, m_10, threadIdx.x, plus<volatile int>());
for (int v = 1; v <= half_k; ++v)
{
// Proceed over the two lines
int v_sum = 0;
int m_sum = 0;
const int d = c_u_max[v];
for (int u = threadIdx.x - d; u <= d; u += blockDim.x)
{
int val_plus = image(loc.y + v, loc.x + u);
int val_minus = image(loc.y - v, loc.x + u);
v_sum += (val_plus - val_minus);
m_sum += u * (val_plus + val_minus);
}
reduce<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
reduce<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
m_10 += m_sum;
m_01 += v * v_sum;
}
if (threadIdx.x == 0)
{
float kp_dir = ::atan2f((float)m_01, (float)m_10);
kp_dir += (kp_dir < 0) * (2.0f * CV_PI);
kp_dir *= 180.0f / CV_PI;
angle[ptidx] = kp_dir;
}
}
}
void IC_Angle_gpu(PtrStepSzb image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(npoints, block.y);
IC_Angle<<<grid, block, 0, stream>>>(image, loc, angle, npoints, half_k);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// computeOrbDescriptor
template <int WTA_K> struct OrbDescriptor;
#define GET_VALUE(idx) \
img(loc.y + __float2int_rn(pattern_x[idx] * sina + pattern_y[idx] * cosa), \
loc.x + __float2int_rn(pattern_x[idx] * cosa - pattern_y[idx] * sina))
template <> struct OrbDescriptor<2>
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 16 * i;
pattern_y += 16 * i;
int t0, t1, val;
t0 = GET_VALUE(0); t1 = GET_VALUE(1);
val = t0 < t1;
t0 = GET_VALUE(2); t1 = GET_VALUE(3);
val |= (t0 < t1) << 1;
t0 = GET_VALUE(4); t1 = GET_VALUE(5);
val |= (t0 < t1) << 2;
t0 = GET_VALUE(6); t1 = GET_VALUE(7);
val |= (t0 < t1) << 3;
t0 = GET_VALUE(8); t1 = GET_VALUE(9);
val |= (t0 < t1) << 4;
t0 = GET_VALUE(10); t1 = GET_VALUE(11);
val |= (t0 < t1) << 5;
t0 = GET_VALUE(12); t1 = GET_VALUE(13);
val |= (t0 < t1) << 6;
t0 = GET_VALUE(14); t1 = GET_VALUE(15);
val |= (t0 < t1) << 7;
return val;
}
};
template <> struct OrbDescriptor<3>
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 12 * i;
pattern_y += 12 * i;
int t0, t1, t2, val;
t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);
val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);
t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;
t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;
t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;
return val;
}
};
template <> struct OrbDescriptor<4>
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 16 * i;
pattern_y += 16 * i;
int t0, t1, t2, t3, k, val;
int a, b;
t0 = GET_VALUE(0); t1 = GET_VALUE(1);
t2 = GET_VALUE(2); t3 = GET_VALUE(3);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val = k;
t0 = GET_VALUE(4); t1 = GET_VALUE(5);
t2 = GET_VALUE(6); t3 = GET_VALUE(7);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 2;
t0 = GET_VALUE(8); t1 = GET_VALUE(9);
t2 = GET_VALUE(10); t3 = GET_VALUE(11);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 4;
t0 = GET_VALUE(12); t1 = GET_VALUE(13);
t2 = GET_VALUE(14); t3 = GET_VALUE(15);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 6;
return val;
}
};
#undef GET_VALUE
template <int WTA_K>
__global__ void computeOrbDescriptor(const PtrStepb img, const short2* loc, const float* angle_, const int npoints,
const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize)
{
const int descidx = blockIdx.x * blockDim.x + threadIdx.x;
const int ptidx = blockIdx.y * blockDim.y + threadIdx.y;
if (ptidx < npoints && descidx < dsize)
{
float angle = angle_[ptidx];
angle *= (float)(CV_PI / 180.f);
float sina, cosa;
::sincosf(angle, &sina, &cosa);
desc.ptr(ptidx)[descidx] = OrbDescriptor<WTA_K>::calc(img, loc[ptidx], pattern_x, pattern_y, sina, cosa, descidx);
}
}
void computeOrbDescriptor_gpu(PtrStepb img, const short2* loc, const float* angle, const int npoints,
const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize, int WTA_K, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(dsize, block.x);
grid.y = divUp(npoints, block.y);
switch (WTA_K)
{
case 2:
computeOrbDescriptor<2><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
break;
case 3:
computeOrbDescriptor<3><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
break;
case 4:
computeOrbDescriptor<4><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// mergeLocation
__global__ void mergeLocation(const short2* loc_, float* x, float* y, const int npoints, float scale)
{
const int ptidx = blockIdx.x * blockDim.x + threadIdx.x;
if (ptidx < npoints)
{
short2 loc = loc_[ptidx];
x[ptidx] = loc.x * scale;
y[ptidx] = loc.y * scale;
}
}
void mergeLocation_gpu(const short2* loc, float* x, float* y, int npoints, float scale, cudaStream_t stream)
{
dim3 block(256);
dim3 grid;
grid.x = divUp(npoints, block.x);
mergeLocation<<<grid, block, 0, stream>>>(loc, x, y, npoints, scale);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif /* CUDA_DISABLER */
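For reference, a minimal CPU sketch of the intensity-centroid orientation that IC_Angle assigns to each keypoint: accumulate the patch moments m_10 and m_01 over the circular patch described by c_u_max, then convert atan2(m_01, m_10) to degrees in [0, 360). Plain C++ with a hypothetical image accessor; not part of this commit.

#include <cmath>
#include <vector>

// CPU reference for IC_Angle. img(y, x) is a hypothetical intensity accessor;
// u_max[v] plays the role of c_u_max (patch half-width at vertical offset v).
template <typename Img>
float icAngleReference(const Img& img, int cx, int cy, const std::vector<int>& u_max, int half_k)
{
    const float pi = 3.14159265358979f;
    int m_01 = 0, m_10 = 0;

    for (int u = -half_k; u <= half_k; ++u)        // center line, v = 0
        m_10 += u * img(cy, cx + u);

    for (int v = 1; v <= half_k; ++v)              // symmetric line pairs
    {
        const int d = u_max[v];
        for (int u = -d; u <= d; ++u)
        {
            const int val_plus  = img(cy + v, cx + u);
            const int val_minus = img(cy - v, cx + u);
            m_10 += u * (val_plus + val_minus);
            m_01 += v * (val_plus - val_minus);
        }
    }

    float kp_dir = std::atan2((float)m_01, (float)m_10);
    if (kp_dir < 0)
        kp_dir += 2.0f * pi;
    return kp_dir * 180.0f / pi;                   // degrees in [0, 360)
}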


@@ -1,228 +1,228 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
template <typename T, typename B> __global__ void pyrDown(const PtrStepSz<T> src, PtrStep<T> dst, const B b, int dst_cols)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_t;
__shared__ work_t smem[256 + 4];
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y;
const int src_y = 2 * y;
if (src_y >= 2 && src_y < src.rows - 2 && x >= 2 && x < src.cols - 2)
{
{
work_t sum;
sum = 0.0625f * src(src_y - 2, x);
sum = sum + 0.25f * src(src_y - 1, x);
sum = sum + 0.375f * src(src_y , x);
sum = sum + 0.25f * src(src_y + 1, x);
sum = sum + 0.0625f * src(src_y + 2, x);
smem[2 + threadIdx.x] = sum;
}
if (threadIdx.x < 2)
{
const int left_x = x - 2;
work_t sum;
sum = 0.0625f * src(src_y - 2, left_x);
sum = sum + 0.25f * src(src_y - 1, left_x);
sum = sum + 0.375f * src(src_y , left_x);
sum = sum + 0.25f * src(src_y + 1, left_x);
sum = sum + 0.0625f * src(src_y + 2, left_x);
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + 2;
work_t sum;
sum = 0.0625f * src(src_y - 2, right_x);
sum = sum + 0.25f * src(src_y - 1, right_x);
sum = sum + 0.375f * src(src_y , right_x);
sum = sum + 0.25f * src(src_y + 1, right_x);
sum = sum + 0.0625f * src(src_y + 2, right_x);
smem[4 + threadIdx.x] = sum;
}
}
else
{
{
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(x));
sum = sum + 0.375f * src(src_y , b.idx_col_high(x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(x));
smem[2 + threadIdx.x] = sum;
}
if (threadIdx.x < 2)
{
const int left_x = x - 2;
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col(left_x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col(left_x));
sum = sum + 0.375f * src(src_y , b.idx_col(left_x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col(left_x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col(left_x));
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + 2;
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(right_x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(right_x));
sum = sum + 0.375f * src(src_y , b.idx_col_high(right_x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(right_x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(right_x));
smem[4 + threadIdx.x] = sum;
}
}
__syncthreads();
if (threadIdx.x < 128)
{
const int tid2 = threadIdx.x * 2;
work_t sum;
sum = 0.0625f * smem[2 + tid2 - 2];
sum = sum + 0.25f * smem[2 + tid2 - 1];
sum = sum + 0.375f * smem[2 + tid2 ];
sum = sum + 0.25f * smem[2 + tid2 + 1];
sum = sum + 0.0625f * smem[2 + tid2 + 2];
const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
if (dst_x < dst_cols)
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
}
}
template <typename T, template <typename> class B> void pyrDown_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(src.cols, block.x), dst.rows);
B<T> b(src.rows, src.cols);
pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
pyrDown_caller<T, BrdReflect101>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
}
template void pyrDown_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
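For reference, a minimal CPU sketch of pyrDown on a single-channel float image: a separable 5-tap filter with weights 1/16, 4/16, 6/16, 4/16, 1/16, followed by dropping every other row and column. Borders are simply clamped here, whereas the kernel above uses the BrdReflect101 policy; not part of this commit.

#include <algorithm>
#include <vector>

// CPU reference for pyrDown (single channel, float). Vertical 5-tap pass,
// horizontal 5-tap pass, then 2x decimation. Clamped borders (simplification).
inline void pyr_down_cpu(const std::vector<float>& src, int rows, int cols, std::vector<float>& dst)
{
    const float w[5] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f };
    const int drows = (rows + 1) / 2;
    const int dcols = (cols + 1) / 2;
    std::vector<float> tmp(rows * cols);

    for (int y = 0; y < rows; ++y)                 // vertical pass
        for (int x = 0; x < cols; ++x)
        {
            float sum = 0.0f;
            for (int k = -2; k <= 2; ++k)
            {
                const int yy = std::min(std::max(y + k, 0), rows - 1);
                sum += w[k + 2] * src[yy * cols + x];
            }
            tmp[y * cols + x] = sum;
        }

    dst.assign(drows * dcols, 0.0f);
    for (int y = 0; y < drows; ++y)                // horizontal pass + decimation
        for (int x = 0; x < dcols; ++x)
        {
            float sum = 0.0f;
            for (int k = -2; k <= 2; ++k)
            {
                const int xx = std::min(std::max(2 * x + k, 0), cols - 1);
                sum += w[k + 2] * tmp[(2 * y) * cols + xx];
            }
            dst[y * dcols + x] = sum;
        }
}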


@@ -1,196 +1,196 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
template <typename T> __global__ void pyrUp(const PtrStepSz<T> src, PtrStepSz<T> dst)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
__shared__ sum_t s_srcPatch[10][10];
__shared__ sum_t s_dstPatch[20][16];
if (threadIdx.x < 10 && threadIdx.y < 10)
{
int srcx = static_cast<int>((blockIdx.x * blockDim.x) / 2 + threadIdx.x) - 1;
int srcy = static_cast<int>((blockIdx.y * blockDim.y) / 2 + threadIdx.y) - 1;
srcx = ::abs(srcx);
srcx = ::min(src.cols - 1, srcx);
srcy = ::abs(srcy);
srcy = ::min(src.rows - 1, srcy);
s_srcPatch[threadIdx.y][threadIdx.x] = saturate_cast<sum_t>(src(srcy, srcx));
}
__syncthreads();
sum_t sum = VecTraits<sum_t>::all(0);
const int evenFlag = static_cast<int>((threadIdx.x & 1) == 0);
const int oddFlag = static_cast<int>((threadIdx.x & 1) != 0);
const bool eveny = ((threadIdx.y & 1) == 0);
const int tidx = threadIdx.x;
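// Horizontal pass of the 5-tap binomial kernel (1 4 6 4 1) / 16: threads in even
// columns pick up the 0.0625 and 0.375 taps, threads in odd columns the 0.25 taps,
// selected branchlessly through evenFlag / oddFlag.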
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[2 + threadIdx.y][threadIdx.x] = sum;
if (threadIdx.y < 2)
{
sum = VecTraits<sum_t>::all(0);
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[threadIdx.y][threadIdx.x] = sum;
}
if (threadIdx.y > 13)
{
sum = VecTraits<sum_t>::all(0);
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + threadIdx.y][threadIdx.x] = sum;
}
__syncthreads();
sum = VecTraits<sum_t>::all(0);
const int tidy = threadIdx.y;
sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][threadIdx.x];
sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][threadIdx.x];
sum = sum + 0.375f * s_dstPatch[2 + tidy ][threadIdx.x];
sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][threadIdx.x];
sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][threadIdx.x];
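// Scale by 4 to compensate for the zero samples introduced by the 2x upsampling,
// so the upsampled image keeps the brightness of the source (standard pyrUp behaviour).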
if (x < dst.cols && y < dst.rows)
dst(y, x) = saturate_cast<T>(4.0f * sum);
}
template <typename T> void pyrUp_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
pyrUp<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
pyrUp_caller<T>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
}
template void pyrUp_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
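For orientation, here is a minimal host-side sketch of how these instantiations are normally reached through the public GPU module. It is only a sketch: it assumes the 2.4-era cv::gpu API (GpuMat, pyrDown, pyrUp); header names and exact signatures may differ between releases.

#include <opencv2/gpu/gpu.hpp>

// One pyramid round trip: downsample a level, then upsample it back.
// For a CV_8UC3 input this would dispatch to the uchar3 instantiations above.
void pyramid_round_trip(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& restored)
{
    cv::gpu::GpuMat half;
    cv::gpu::pyrDown(src, half);     // half has roughly half the rows and columns of src
    cv::gpu::pyrUp(half, restored);  // restored is roughly src-sized, low-pass filtered
}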

File diff suppressed because it is too large Load Diff


@@ -1,274 +1,274 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, PtrStepSz<T> dst)
{
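// One thread per destination pixel: read its floating-point source coordinates from
// mapx / mapy and sample the source through the Ptr2D wrapper, which already combines
// the interpolation filter with the border handling.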
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = mapx.ptr(y)[x];
const float ycoo = mapy.ptr(y)[x];
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
{
static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, int)
{
(void)srcWhole;
(void)xoff;
(void)yoff;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
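// The macro below specializes RemapDispatcherNonStream for primitive element types so the
// source is read through a 2D texture reference bound to the whole image; the BrdReplicate
// specialization additionally skips the border reader when the ROI covers the entire image.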
#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_remap_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff, yoff; \
tex_remap_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_remap_ ## type , x + xoff, y + yoff); \
} \
}; \
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float* borderValue, int cc) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc >= 20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float*, int) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc)
{
if (stream == 0)
RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc);
else
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);
}
};
template <typename T> void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc);
static const caller_t callers[3][5] =
{
{
RemapDispatcher<PointFilter, BrdReflect101, T>::call,
RemapDispatcher<PointFilter, BrdReplicate, T>::call,
RemapDispatcher<PointFilter, BrdConstant, T>::call,
RemapDispatcher<PointFilter, BrdReflect, T>::call,
RemapDispatcher<PointFilter, BrdWrap, T>::call
},
{
RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
RemapDispatcher<LinearFilter, BrdConstant, T>::call,
RemapDispatcher<LinearFilter, BrdReflect, T>::call,
RemapDispatcher<LinearFilter, BrdWrap, T>::call
},
{
RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
RemapDispatcher<CubicFilter, BrdConstant, T>::call,
RemapDispatcher<CubicFilter, BrdReflect, T>::call,
RemapDispatcher<CubicFilter, BrdWrap, T>::call
}
};
callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc);
}
template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
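A minimal host-side sketch of driving this code through cv::gpu::remap follows. The wrapper signature is assumed from the 2.4-era GPU module; xmap and ymap are CV_32FC1 images holding, for every destination pixel, the source coordinates to sample.

#include <opencv2/gpu/gpu.hpp>

// Mirror an image horizontally with remap: destination pixel (x, y) samples
// source coordinate (cols - 1 - x, y).
void mirror_with_remap(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst)
{
    cv::Mat xmap(src.size(), CV_32FC1), ymap(src.size(), CV_32FC1);
    for (int y = 0; y < src.rows; ++y)
        for (int x = 0; x < src.cols; ++x)
        {
            xmap.at<float>(y, x) = static_cast<float>(src.cols - 1 - x);
            ymap.at<float>(y, x) = static_cast<float>(y);
        }
    cv::gpu::GpuMat d_xmap(xmap), d_ymap(ymap);
    cv::gpu::remap(src, dst, d_xmap, d_ymap, cv::INTER_LINEAR, cv::BORDER_REPLICATE);
}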


@@ -1,302 +1,302 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"
#include <cfloat>
#include <opencv2/gpu/device/scan.hpp>
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
{
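// One thread per destination pixel: scale the destination coordinate by the
// (src / dst) ratios fx, fy and sample the filtered, border-handled source.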
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = x * fx;
const float ycoo = y * fy;
dst(y, x) = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <typename Ptr2D, typename T> __global__ void resize_area(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
dst(y, x) = saturate_cast<T>(src(y, x));
}
}
template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc, fx, fy);
resize<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <typename T> struct ResizeDispatcherStream<AreaFilter, T>
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
AreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T> struct ResizeDispatcherStream<IntegerAreaFilter, T>
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
IntegerAreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst)
{
(void)srcWhole;
(void)xoff;
(void)yoff;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
resize<<<grid, block>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_resize_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
const int xoff; \
const int yoff; \
__host__ tex_resize_ ## type ## _reader(int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_resize_ ## type, x + xoff, y + yoff); \
} \
}; \
template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type > \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz< type > dst) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_resize_ ## type, srcWhole); \
tex_resize_ ## type ## _reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter<tex_resize_ ## type ## _reader> filteredSrc(texSrc); \
resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
} \
else \
{ \
BrdReplicate< type > brd(src.rows, src.cols); \
BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > brdSrc(texSrc, brd); \
Filter< BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > > filteredSrc(brdSrc); \
resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
template <template <typename> class Filter, typename T> struct ResizeDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
if (stream == 0)
ResizeDispatcherNonStream<Filter, T>::call(src, srcWhole, xoff, yoff, fx, fy, dst);
else
ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
}
};
template <typename T> struct ResizeDispatcher<AreaFilter, T>
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
(void)srcWhole;
(void)xoff;
(void)yoff;
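// When both scale factors are effectively whole numbers, use the specialized
// integer-area kernel; otherwise fall back to the generic area filter.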
int iscale_x = (int)round(fx);
int iscale_y = (int)round(fy);
if( std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN)
ResizeDispatcherStream<IntegerAreaFilter, T>::call(src, fx, fy, dst, stream);
else
ResizeDispatcherStream<AreaFilter, T>::call(src, fx, fy, dst, stream);
}
};
template <typename T> void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
PtrStepSzb dst, int interpolation, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream);
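// Indexed by the OpenCV interpolation flag: 0 = INTER_NEAREST, 1 = INTER_LINEAR,
// 2 = INTER_CUBIC, 3 = INTER_AREA.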
static const caller_t callers[4] =
{
ResizeDispatcher<PointFilter, T>::call,
ResizeDispatcher<LinearFilter, T>::call,
ResizeDispatcher<CubicFilter, T>::call,
ResizeDispatcher<AreaFilter, T>::call
};
// switch to linear interpolation when INTER_AREA is requested for upscaling
if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
interpolation = 1;
callers[interpolation](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, fx, fy,
static_cast< PtrStepSz<T> >(dst), stream);
}
template void resize_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
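// scan_traits maps an element type to the accumulator type used by scan-based code; rows of uchar are accumulated in float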
template<typename T> struct scan_traits{};
template<> struct scan_traits<uchar>
{
typedef float scan_line_type;
};
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */


@@ -1,175 +1,175 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
namespace cv { namespace gpu { namespace device
{
namespace video_encoding
{
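// fixed-point BT.601-style RGB -> YCbCr conversion: coefficients are scaled by 100 and the 128 chroma offset is folded in as 12800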
__device__ __forceinline__ void rgbtoy(const uchar b, const uchar g, const uchar r, uchar& y)
{
y = static_cast<uchar>(((int)(30 * r) + (int)(59 * g) + (int)(11 * b)) / 100);
}
__device__ __forceinline__ void rgbtoyuv(const uchar b, const uchar g, const uchar r, uchar& y, uchar& u, uchar& v)
{
rgbtoy(b, g, r, y);
u = static_cast<uchar>(((int)(-17 * r) - (int)(33 * g) + (int)(50 * b) + 12800) / 100);
v = static_cast<uchar>(((int)(50 * r) - (int)(42 * g) - (int)(8 * b) + 12800) / 100);
}
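// each thread converts a 2x2 block of pixels: four luma samples plus one subsampled U/V pair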
__global__ void Gray_to_YV12(const PtrStepSzb src, PtrStepb dst)
{
const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
if (x + 1 >= src.cols || y + 1 >= src.rows)
return;
// get pointers to the data
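// YV12 is planar: a full-resolution luma plane followed by two quarter-size chroma planes, each with half the luma stride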
const size_t planeSize = src.rows * dst.step;
PtrStepb y_plane(dst.data, dst.step);
PtrStepb u_plane(y_plane.data + planeSize, dst.step / 2);
PtrStepb v_plane(u_plane.data + (planeSize / 4), dst.step / 2);
uchar pix;
uchar y_val, u_val, v_val;
pix = src(y, x);
rgbtoy(pix, pix, pix, y_val);
y_plane(y, x) = y_val;
pix = src(y, x + 1);
rgbtoy(pix, pix, pix, y_val);
y_plane(y, x + 1) = y_val;
pix = src(y + 1, x);
rgbtoy(pix, pix, pix, y_val);
y_plane(y + 1, x) = y_val;
pix = src(y + 1, x + 1);
rgbtoyuv(pix, pix, pix, y_val, u_val, v_val);
y_plane(y + 1, x + 1) = y_val;
u_plane(y / 2, x / 2) = u_val;
v_plane(y / 2, x / 2) = v_val;
}
template <typename T>
__global__ void BGR_to_YV12(const PtrStepSz<T> src, PtrStepb dst)
{
const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
if (x + 1 >= src.cols || y + 1 >= src.rows)
return;
// get pointers to the data
const size_t planeSize = src.rows * dst.step;
PtrStepb y_plane(dst.data, dst.step);
PtrStepb u_plane(y_plane.data + planeSize, dst.step / 2);
PtrStepb v_plane(u_plane.data + (planeSize / 4), dst.step / 2);
T pix;
uchar y_val, u_val, v_val;
pix = src(y, x);
rgbtoy(pix.z, pix.y, pix.x, y_val);
y_plane(y, x) = y_val;
pix = src(y, x + 1);
rgbtoy(pix.z, pix.y, pix.x, y_val);
y_plane(y, x + 1) = y_val;
pix = src(y + 1, x);
rgbtoy(pix.z, pix.y, pix.x, y_val);
y_plane(y + 1, x) = y_val;
pix = src(y + 1, x + 1);
rgbtoyuv(pix.z, pix.y, pix.x, y_val, u_val, v_val);
y_plane(y + 1, x + 1) = y_val;
u_plane(y / 2, x / 2) = u_val;
v_plane(y / 2, x / 2) = v_val;
}
void Gray_to_YV12_caller(const PtrStepSzb src, PtrStepb dst)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2));
Gray_to_YV12<<<grid, block>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int cn>
void BGR_to_YV12_caller(const PtrStepSzb src, PtrStepb dst)
{
typedef typename TypeVec<uchar, cn>::vec_type src_t;
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2));
BGR_to_YV12<<<grid, block>>>(static_cast< PtrStepSz<src_t> >(src), dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
void YV12_gpu(const PtrStepSzb src, int cn, PtrStepSzb dst)
{
typedef void (*func_t)(const PtrStepSzb src, PtrStepb dst);
static const func_t funcs[] =
{
0, Gray_to_YV12_caller, 0, BGR_to_YV12_caller<3>, BGR_to_YV12_caller<4>
};
funcs[cn](src, dst);
}
}
}}}
#endif /* CUDA_DISABLER */


@@ -1,387 +1,387 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/static_check.hpp"
namespace cv { namespace gpu { namespace device
{
namespace row_filter
{
#define MAX_KERNEL_SIZE 32
__constant__ float c_kernel[MAX_KERNEL_SIZE];
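// copies the filter taps into constant memory; the source pointer is expected to live on the device (device-to-device copy)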
void loadKernel(const float* kernel, int ksize, cudaStream_t stream)
{
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
}
template <int KSIZE, typename T, typename D, typename B>
__global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 8;
const int PATCH_PER_BLOCK = 4;
const int HALO_SIZE = 1;
#else
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 4;
const int PATCH_PER_BLOCK = 4;
const int HALO_SIZE = 1;
#endif
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
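// shared-memory row cache: PATCH_PER_BLOCK tiles of BLOCK_DIM_X pixels plus HALO_SIZE halo tiles on each side, one row per threadIdx.y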
__shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X];
const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
if (y >= src.rows)
return;
const T* src_row = src.ptr(y);
const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;
if (blockIdx.x > 0)
{
//Load left halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]);
}
else
{
//Load left halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));
}
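// all but the last two block columns lie fully inside the row, so they load the main data and right halo directly; the last two go through the border handler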
if (blockIdx.x + 2 < gridDim.x)
{
//Load main data
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]);
//Load right halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]);
}
else
{
//Load main data
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));
//Load right halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));
}
__syncthreads();
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
{
const int x = xStart + j * BLOCK_DIM_X;
if (x < src.cols)
{
sum_t sum = VecTraits<sum_t>::all(0);
#pragma unroll
for (int k = 0; k < KSIZE; ++k)
sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
dst(y, x) = saturate_cast<D>(sum);
}
}
}
template <int KSIZE, typename T, typename D, template<typename> class B>
void linearRowFilter_caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
{
int BLOCK_DIM_X;
int BLOCK_DIM_Y;
int PATCH_PER_BLOCK;
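// taller thread blocks (BLOCK_DIM_Y = 8) on devices with compute capability >= 2.0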
if (cc >= 20)
{
BLOCK_DIM_X = 32;
BLOCK_DIM_Y = 8;
PATCH_PER_BLOCK = 4;
}
else
{
BLOCK_DIM_X = 32;
BLOCK_DIM_Y = 4;
PATCH_PER_BLOCK = 4;
}
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y));
B<T> brd(src.cols);
linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, typename D>
void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
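// dispatch table indexed by border type and kernel size (1..32); index 0 is an unused placeholder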
static const caller_t callers[5][33] =
{
{
0,
linearRowFilter_caller< 1, T, D, BrdRowReflect101>,
linearRowFilter_caller< 2, T, D, BrdRowReflect101>,
linearRowFilter_caller< 3, T, D, BrdRowReflect101>,
linearRowFilter_caller< 4, T, D, BrdRowReflect101>,
linearRowFilter_caller< 5, T, D, BrdRowReflect101>,
linearRowFilter_caller< 6, T, D, BrdRowReflect101>,
linearRowFilter_caller< 7, T, D, BrdRowReflect101>,
linearRowFilter_caller< 8, T, D, BrdRowReflect101>,
linearRowFilter_caller< 9, T, D, BrdRowReflect101>,
linearRowFilter_caller<10, T, D, BrdRowReflect101>,
linearRowFilter_caller<11, T, D, BrdRowReflect101>,
linearRowFilter_caller<12, T, D, BrdRowReflect101>,
linearRowFilter_caller<13, T, D, BrdRowReflect101>,
linearRowFilter_caller<14, T, D, BrdRowReflect101>,
linearRowFilter_caller<15, T, D, BrdRowReflect101>,
linearRowFilter_caller<16, T, D, BrdRowReflect101>,
linearRowFilter_caller<17, T, D, BrdRowReflect101>,
linearRowFilter_caller<18, T, D, BrdRowReflect101>,
linearRowFilter_caller<19, T, D, BrdRowReflect101>,
linearRowFilter_caller<20, T, D, BrdRowReflect101>,
linearRowFilter_caller<21, T, D, BrdRowReflect101>,
linearRowFilter_caller<22, T, D, BrdRowReflect101>,
linearRowFilter_caller<23, T, D, BrdRowReflect101>,
linearRowFilter_caller<24, T, D, BrdRowReflect101>,
linearRowFilter_caller<25, T, D, BrdRowReflect101>,
linearRowFilter_caller<26, T, D, BrdRowReflect101>,
linearRowFilter_caller<27, T, D, BrdRowReflect101>,
linearRowFilter_caller<28, T, D, BrdRowReflect101>,
linearRowFilter_caller<29, T, D, BrdRowReflect101>,
linearRowFilter_caller<30, T, D, BrdRowReflect101>,
linearRowFilter_caller<31, T, D, BrdRowReflect101>,
linearRowFilter_caller<32, T, D, BrdRowReflect101>
},
{
0,
linearRowFilter_caller< 1, T, D, BrdRowReplicate>,
linearRowFilter_caller< 2, T, D, BrdRowReplicate>,
linearRowFilter_caller< 3, T, D, BrdRowReplicate>,
linearRowFilter_caller< 4, T, D, BrdRowReplicate>,
linearRowFilter_caller< 5, T, D, BrdRowReplicate>,
linearRowFilter_caller< 6, T, D, BrdRowReplicate>,
linearRowFilter_caller< 7, T, D, BrdRowReplicate>,
linearRowFilter_caller< 8, T, D, BrdRowReplicate>,
linearRowFilter_caller< 9, T, D, BrdRowReplicate>,
linearRowFilter_caller<10, T, D, BrdRowReplicate>,
linearRowFilter_caller<11, T, D, BrdRowReplicate>,
linearRowFilter_caller<12, T, D, BrdRowReplicate>,
linearRowFilter_caller<13, T, D, BrdRowReplicate>,
linearRowFilter_caller<14, T, D, BrdRowReplicate>,
linearRowFilter_caller<15, T, D, BrdRowReplicate>,
linearRowFilter_caller<16, T, D, BrdRowReplicate>,
linearRowFilter_caller<17, T, D, BrdRowReplicate>,
linearRowFilter_caller<18, T, D, BrdRowReplicate>,
linearRowFilter_caller<19, T, D, BrdRowReplicate>,
linearRowFilter_caller<20, T, D, BrdRowReplicate>,
linearRowFilter_caller<21, T, D, BrdRowReplicate>,
linearRowFilter_caller<22, T, D, BrdRowReplicate>,
linearRowFilter_caller<23, T, D, BrdRowReplicate>,
linearRowFilter_caller<24, T, D, BrdRowReplicate>,
linearRowFilter_caller<25, T, D, BrdRowReplicate>,
linearRowFilter_caller<26, T, D, BrdRowReplicate>,
linearRowFilter_caller<27, T, D, BrdRowReplicate>,
linearRowFilter_caller<28, T, D, BrdRowReplicate>,
linearRowFilter_caller<29, T, D, BrdRowReplicate>,
linearRowFilter_caller<30, T, D, BrdRowReplicate>,
linearRowFilter_caller<31, T, D, BrdRowReplicate>,
linearRowFilter_caller<32, T, D, BrdRowReplicate>
},
{
0,
linearRowFilter_caller< 1, T, D, BrdRowConstant>,
linearRowFilter_caller< 2, T, D, BrdRowConstant>,
linearRowFilter_caller< 3, T, D, BrdRowConstant>,
linearRowFilter_caller< 4, T, D, BrdRowConstant>,
linearRowFilter_caller< 5, T, D, BrdRowConstant>,
linearRowFilter_caller< 6, T, D, BrdRowConstant>,
linearRowFilter_caller< 7, T, D, BrdRowConstant>,
linearRowFilter_caller< 8, T, D, BrdRowConstant>,
linearRowFilter_caller< 9, T, D, BrdRowConstant>,
linearRowFilter_caller<10, T, D, BrdRowConstant>,
linearRowFilter_caller<11, T, D, BrdRowConstant>,
linearRowFilter_caller<12, T, D, BrdRowConstant>,
linearRowFilter_caller<13, T, D, BrdRowConstant>,
linearRowFilter_caller<14, T, D, BrdRowConstant>,
linearRowFilter_caller<15, T, D, BrdRowConstant>,
linearRowFilter_caller<16, T, D, BrdRowConstant>,
linearRowFilter_caller<17, T, D, BrdRowConstant>,
linearRowFilter_caller<18, T, D, BrdRowConstant>,
linearRowFilter_caller<19, T, D, BrdRowConstant>,
linearRowFilter_caller<20, T, D, BrdRowConstant>,
linearRowFilter_caller<21, T, D, BrdRowConstant>,
linearRowFilter_caller<22, T, D, BrdRowConstant>,
linearRowFilter_caller<23, T, D, BrdRowConstant>,
linearRowFilter_caller<24, T, D, BrdRowConstant>,
linearRowFilter_caller<25, T, D, BrdRowConstant>,
linearRowFilter_caller<26, T, D, BrdRowConstant>,
linearRowFilter_caller<27, T, D, BrdRowConstant>,
linearRowFilter_caller<28, T, D, BrdRowConstant>,
linearRowFilter_caller<29, T, D, BrdRowConstant>,
linearRowFilter_caller<30, T, D, BrdRowConstant>,
linearRowFilter_caller<31, T, D, BrdRowConstant>,
linearRowFilter_caller<32, T, D, BrdRowConstant>
},
{
0,
linearRowFilter_caller< 1, T, D, BrdRowReflect>,
linearRowFilter_caller< 2, T, D, BrdRowReflect>,
linearRowFilter_caller< 3, T, D, BrdRowReflect>,
linearRowFilter_caller< 4, T, D, BrdRowReflect>,
linearRowFilter_caller< 5, T, D, BrdRowReflect>,
linearRowFilter_caller< 6, T, D, BrdRowReflect>,
linearRowFilter_caller< 7, T, D, BrdRowReflect>,
linearRowFilter_caller< 8, T, D, BrdRowReflect>,
linearRowFilter_caller< 9, T, D, BrdRowReflect>,
linearRowFilter_caller<10, T, D, BrdRowReflect>,
linearRowFilter_caller<11, T, D, BrdRowReflect>,
linearRowFilter_caller<12, T, D, BrdRowReflect>,
linearRowFilter_caller<13, T, D, BrdRowReflect>,
linearRowFilter_caller<14, T, D, BrdRowReflect>,
linearRowFilter_caller<15, T, D, BrdRowReflect>,
linearRowFilter_caller<16, T, D, BrdRowReflect>,
linearRowFilter_caller<17, T, D, BrdRowReflect>,
linearRowFilter_caller<18, T, D, BrdRowReflect>,
linearRowFilter_caller<19, T, D, BrdRowReflect>,
linearRowFilter_caller<20, T, D, BrdRowReflect>,
linearRowFilter_caller<21, T, D, BrdRowReflect>,
linearRowFilter_caller<22, T, D, BrdRowReflect>,
linearRowFilter_caller<23, T, D, BrdRowReflect>,
linearRowFilter_caller<24, T, D, BrdRowReflect>,
linearRowFilter_caller<25, T, D, BrdRowReflect>,
linearRowFilter_caller<26, T, D, BrdRowReflect>,
linearRowFilter_caller<27, T, D, BrdRowReflect>,
linearRowFilter_caller<28, T, D, BrdRowReflect>,
linearRowFilter_caller<29, T, D, BrdRowReflect>,
linearRowFilter_caller<30, T, D, BrdRowReflect>,
linearRowFilter_caller<31, T, D, BrdRowReflect>,
linearRowFilter_caller<32, T, D, BrdRowReflect>
},
{
0,
linearRowFilter_caller< 1, T, D, BrdRowWrap>,
linearRowFilter_caller< 2, T, D, BrdRowWrap>,
linearRowFilter_caller< 3, T, D, BrdRowWrap>,
linearRowFilter_caller< 4, T, D, BrdRowWrap>,
linearRowFilter_caller< 5, T, D, BrdRowWrap>,
linearRowFilter_caller< 6, T, D, BrdRowWrap>,
linearRowFilter_caller< 7, T, D, BrdRowWrap>,
linearRowFilter_caller< 8, T, D, BrdRowWrap>,
linearRowFilter_caller< 9, T, D, BrdRowWrap>,
linearRowFilter_caller<10, T, D, BrdRowWrap>,
linearRowFilter_caller<11, T, D, BrdRowWrap>,
linearRowFilter_caller<12, T, D, BrdRowWrap>,
linearRowFilter_caller<13, T, D, BrdRowWrap>,
linearRowFilter_caller<14, T, D, BrdRowWrap>,
linearRowFilter_caller<15, T, D, BrdRowWrap>,
linearRowFilter_caller<16, T, D, BrdRowWrap>,
linearRowFilter_caller<17, T, D, BrdRowWrap>,
linearRowFilter_caller<18, T, D, BrdRowWrap>,
linearRowFilter_caller<19, T, D, BrdRowWrap>,
linearRowFilter_caller<20, T, D, BrdRowWrap>,
linearRowFilter_caller<21, T, D, BrdRowWrap>,
linearRowFilter_caller<22, T, D, BrdRowWrap>,
linearRowFilter_caller<23, T, D, BrdRowWrap>,
linearRowFilter_caller<24, T, D, BrdRowWrap>,
linearRowFilter_caller<25, T, D, BrdRowWrap>,
linearRowFilter_caller<26, T, D, BrdRowWrap>,
linearRowFilter_caller<27, T, D, BrdRowWrap>,
linearRowFilter_caller<28, T, D, BrdRowWrap>,
linearRowFilter_caller<29, T, D, BrdRowWrap>,
linearRowFilter_caller<30, T, D, BrdRowWrap>,
linearRowFilter_caller<31, T, D, BrdRowWrap>,
linearRowFilter_caller<32, T, D, BrdRowWrap>
}
};
loadKernel(kernel, ksize, stream);
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
}
template void linearRowFilter_gpu<uchar , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<uchar4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<int , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
} // namespace row_filter
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */



@@ -1,95 +1,95 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_CUDA_SAFE_CALL_HPP__
#define __OPENCV_CUDA_SAFE_CALL_HPP__
#include <cuda_runtime_api.h>
#include <cufft.h>
#include <cublas.h>
#include "NCV.hpp"
#if defined(__GNUC__)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
#define ncvSafeCall(expr) ___ncvSafeCall(expr, __FILE__, __LINE__, __func__)
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__)
#define ncvSafeCall(expr) ___ncvSafeCall(expr, __FILE__, __LINE__)
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__)
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__)
#endif
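// The *SafeCall macros forward the returned status code plus __FILE__/__LINE__ (and
// __func__ where the compiler provides it) to the matching reporting helper below,
// which is expected to raise an OpenCV error when the call did not succeed.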
namespace cv { namespace gpu
{
void nppError(int err, const char *file, const int line, const char *func = "");
void ncvError(int err, const char *file, const int line, const char *func = "");
void cufftError(int err, const char *file, const int line, const char *func = "");
void cublasError(int err, const char *file, const int line, const char *func = "");
}}
static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)
cv::gpu::nppError(err, file, line, func);
}
static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (NCV_SUCCESS != err)
cv::gpu::ncvError(err, file, line, func);
}
static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
{
if (CUFFT_SUCCESS != err)
cv::gpu::cufftError(err, file, line, func);
}
static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
{
if (CUBLAS_STATUS_SUCCESS != err)
cv::gpu::cublasError(err, file, line, func);
}
#endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_CUDA_SAFE_CALL_HPP__
#define __OPENCV_CUDA_SAFE_CALL_HPP__
#include <cuda_runtime_api.h>
#include <cufft.h>
#include <cublas.h>
#include "NCV.hpp"
#if defined(__GNUC__)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
#define ncvSafeCall(expr) ___ncvSafeCall(expr, __FILE__, __LINE__, __func__)
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__)
#define ncvSafeCall(expr) ___ncvSafeCall(expr, __FILE__, __LINE__)
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__)
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__)
#endif
namespace cv { namespace gpu
{
void nppError(int err, const char *file, const int line, const char *func = "");
void ncvError(int err, const char *file, const int line, const char *func = "");
void cufftError(int err, const char *file, const int line, const char *func = "");
void cublasError(int err, const char *file, const int line, const char *func = "");
}}
static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)
cv::gpu::nppError(err, file, line, func);
}
static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (NCV_SUCCESS != err)
cv::gpu::ncvError(err, file, line, func);
}
static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
{
if (CUFFT_SUCCESS != err)
cv::gpu::cufftError(err, file, line, func);
}
static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
{
if (CUBLAS_STATUS_SUCCESS != err)
cv::gpu::cublasError(err, file, line, func);
}
#endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -56,14 +56,14 @@ namespace cv
template<class T, enum cudaTextureReadMode readMode>
TextureBinder(const PtrStepSz<T>& arr, const struct texture<T, 2, readMode>& tex) : texref(&tex)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step) );
}
template<class T, enum cudaTextureReadMode readMode>
TextureBinder(const PtrSz<T>& arr, const struct texture<T, 1, readMode> &tex) : texref(&tex)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture(0, tex, arr.data, desc, arr.size * arr.elemSize()) );
}


@@ -1,389 +1,389 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
__constant__ float c_warpMat[3 * 3];
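// c_warpMat holds the warp coefficients in constant memory, row-major. Affine warps write
// only the first six entries (a 2x3 matrix); perspective warps write a full 3x3 homography.
// AffineTransform maps (x, y) to (m00*x + m01*y + m02, m10*x + m11*y + m12), while
// PerspectiveTransform additionally divides by the homogeneous term m20*x + m21*y + m22.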
struct AffineTransform
{
static __device__ __forceinline__ float2 calcCoord(int x, int y)
{
const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];
return make_float2(xcoo, ycoo);
}
};
struct PerspectiveTransform
{
static __device__ __forceinline__ float2 calcCoord(int x, int y)
{
const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);
return make_float2(xcoo, ycoo);
}
};
///////////////////////////////////////////////////////////////////
// Build Maps
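// One thread per map element: each thread applies the warp matrix to its (x, y) and stores
// the resulting source coordinates in xmap/ymap, which are presumably consumed later by the
// generic remap path.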
template <class Transform> __global__ void buildWarpMaps(PtrStepSzf xmap, PtrStepf ymap)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < xmap.cols && y < xmap.rows)
{
const float2 coord = Transform::calcCoord(x, y);
xmap(y, x) = coord.x;
ymap(y, x) = coord.y;
}
}
template <class Transform> void buildWarpMaps_caller(PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(xmap.cols, block.x), divUp(xmap.rows, block.y));
buildWarpMaps<Transform><<<grid, block, 0, stream>>>(xmap, ymap);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
buildWarpMaps_caller<AffineTransform>(xmap, ymap, stream);
}
void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
buildWarpMaps_caller<PerspectiveTransform>(xmap, ymap, stream);
}
///////////////////////////////////////////////////////////////////
// Warp
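// Gather-style warp: each destination pixel computes its source coordinate through the
// Transform policy and reads it via 'src', a filter/border-reader stack, so interpolation
// and out-of-range handling happen inside that read before the final saturate_cast.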
template <class Transform, class Ptr2D, typename T> __global__ void warp(const Ptr2D src, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float2 coord = Transform::calcCoord(x, y);
dst.ptr(y)[x] = saturate_cast<T>(src(coord.y, coord.x));
}
}
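// Two dispatch paths follow: WarpDispatcherStream launches asynchronously on the caller's
// stream and never synchronizes, while WarpDispatcherNonStream (and the texture-based
// specializations generated by the macro below) runs on the default stream and ends with a
// device synchronize so the result is ready on return.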
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, int)
{
(void)xoff;
(void)yoff;
(void)srcWhole;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
warp<Transform><<<grid, block>>>(filter_src, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
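// For each element type listed after it, OPENCV_GPU_IMPLEMENT_WARP_TEX generates a 2D
// texture reference plus a small reader object and specializes WarpDispatcherNonStream to
// fetch the source through the texture cache. The BrdReplicate specialization adds a fast
// path: when the ROI covers the whole image, the texture's clamp addressing already
// replicates the border, so no explicit border reader is needed.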
#define OPENCV_GPU_IMPLEMENT_WARP_TEX(type) \
texture< type , cudaTextureType2D > tex_warp_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_warp_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff, yoff; \
tex_warp_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_warp_ ## type , x + xoff, y + yoff); \
} \
}; \
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, int cc) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc >= 20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, int) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(schar)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(char2)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(char4)
OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_WARP_TEX(short)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(short2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(short4)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(int)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(int2)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(int4)
OPENCV_GPU_IMPLEMENT_WARP_TEX(float)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(float2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_WARP_TEX
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc)
{
if (stream == 0)
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc);
else
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc);
}
};
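// funcs[interpolation][borderMode]: rows correspond to point/linear/cubic filtering and
// columns to reflect101/replicate/constant/reflect/wrap borders, presumably matching how
// the host wrappers remap the cv::INTER_* and cv::BORDER_* constants before calling in.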
template <class Transform, typename T>
void warp_caller(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
{
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc);
static const func_t funcs[3][5] =
{
{
WarpDispatcher<Transform, PointFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, PointFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, PointFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, PointFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call
},
{
WarpDispatcher<Transform, LinearFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call
},
{
WarpDispatcher<Transform, CubicFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call
}
};
funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc);
}
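// warpAffine_gpu / warpPerspective_gpu upload the 2x3 or 3x3 coefficient block into
// c_warpMat and forward to warp_caller with the matching Transform policy; the explicit
// instantiations below define which element types are visible to the host wrappers.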
template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
}
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template <typename T> void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
}
template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
__constant__ float c_warpMat[3 * 3];
struct AffineTransform
{
static __device__ __forceinline__ float2 calcCoord(int x, int y)
{
const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];
return make_float2(xcoo, ycoo);
}
};
struct PerspectiveTransform
{
static __device__ __forceinline__ float2 calcCoord(int x, int y)
{
const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);
return make_float2(xcoo, ycoo);
}
};
///////////////////////////////////////////////////////////////////
// Build Maps
template <class Transform> __global__ void buildWarpMaps(PtrStepSzf xmap, PtrStepf ymap)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < xmap.cols && y < xmap.rows)
{
const float2 coord = Transform::calcCoord(x, y);
xmap(y, x) = coord.x;
ymap(y, x) = coord.y;
}
}
template <class Transform> void buildWarpMaps_caller(PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(xmap.cols, block.x), divUp(xmap.rows, block.y));
buildWarpMaps<Transform><<<grid, block, 0, stream>>>(xmap, ymap);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
buildWarpMaps_caller<AffineTransform>(xmap, ymap, stream);
}
void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
buildWarpMaps_caller<PerspectiveTransform>(xmap, ymap, stream);
}
///////////////////////////////////////////////////////////////////
// Warp
template <class Transform, class Ptr2D, typename T> __global__ void warp(const Ptr2D src, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float2 coord = Transform::calcCoord(x, y);
dst.ptr(y)[x] = saturate_cast<T>(src(coord.y, coord.x));
}
}
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, int)
{
(void)xoff;
(void)yoff;
(void)srcWhole;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
warp<Transform><<<grid, block>>>(filter_src, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_WARP_TEX(type) \
texture< type , cudaTextureType2D > tex_warp_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_warp_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff, yoff; \
tex_warp_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_warp_ ## type , x + xoff, y + yoff); \
} \
}; \
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, int cc) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc >= 20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, int) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(schar)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(char2)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(char4)
OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_WARP_TEX(short)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(short2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(short4)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(int)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(int2)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(int4)
OPENCV_GPU_IMPLEMENT_WARP_TEX(float)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(float2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_WARP_TEX
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc)
{
if (stream == 0)
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc);
else
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc);
}
};
template <class Transform, typename T>
void warp_caller(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
{
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc);
static const func_t funcs[3][5] =
{
{
WarpDispatcher<Transform, PointFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, PointFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, PointFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, PointFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call
},
{
WarpDispatcher<Transform, LinearFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call
},
{
WarpDispatcher<Transform, CubicFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call
}
};
funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc);
}
template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
}
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template <typename T> void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
}
template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */


@@ -1,253 +1,253 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if defined HAVE_CUDA
struct Stream::Impl
{
static cudaStream_t getStream(const Impl* impl) { return impl ? impl->stream : 0; }
cudaStream_t stream;
int ref_counter;
};
#include "opencv2/gpu/stream_accessor.hpp"
CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream)
{
return Stream::Impl::getStream(stream.impl);
}
#endif /* defined HAVE_CUDA */
#if !defined (HAVE_CUDA)
void cv::gpu::Stream::create() { throw_nogpu(); }
void cv::gpu::Stream::release() { throw_nogpu(); }
cv::gpu::Stream::Stream() : impl(0) { throw_nogpu(); }
cv::gpu::Stream::~Stream() { throw_nogpu(); }
cv::gpu::Stream::Stream(const Stream& /*stream*/) { throw_nogpu(); }
Stream& cv::gpu::Stream::operator=(const Stream& /*stream*/) { throw_nogpu(); return *this; }
bool cv::gpu::Stream::queryIfComplete() { throw_nogpu(); return true; }
void cv::gpu::Stream::waitForCompletion() { throw_nogpu(); }
void cv::gpu::Stream::enqueueDownload(const GpuMat& /*src*/, Mat& /*dst*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueDownload(const GpuMat& /*src*/, CudaMem& /*dst*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueUpload(const CudaMem& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueUpload(const Mat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueCopy(const GpuMat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int /*type*/, double /*a*/, double /*b*/) { throw_nogpu(); }
Stream& cv::gpu::Stream::Null() { throw_nogpu(); static Stream s; return s; }
cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu
{
void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream);
void setTo(GpuMat& src, Scalar s, cudaStream_t stream);
void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
}}
namespace
{
template<class S, class D> void devcopy(const S& src, D& dst, cudaStream_t s, cudaMemcpyKind k)
{
dst.create(src.size(), src.type());
size_t bwidth = src.cols * src.elemSize();
cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, k, s) );
}
}
void cv::gpu::Stream::create()
{
if (impl)
release();
cudaStream_t stream;
cudaSafeCall( cudaStreamCreate( &stream ) );
impl = (Stream::Impl*)fastMalloc(sizeof(Stream::Impl));
impl->stream = stream;
impl->ref_counter = 1;
}
void cv::gpu::Stream::release()
{
if( impl && CV_XADD(&impl->ref_counter, -1) == 1 )
{
cudaSafeCall( cudaStreamDestroy( impl->stream ) );
cv::fastFree( impl );
}
}
cv::gpu::Stream::Stream() : impl(0) { create(); }
cv::gpu::Stream::~Stream() { release(); }
cv::gpu::Stream::Stream(const Stream& stream) : impl(stream.impl)
{
if( impl )
CV_XADD(&impl->ref_counter, 1);
}
Stream& cv::gpu::Stream::operator=(const Stream& stream)
{
if( this != &stream )
{
if( stream.impl )
CV_XADD(&stream.impl->ref_counter, 1);
release();
impl = stream.impl;
}
return *this;
}
bool cv::gpu::Stream::queryIfComplete()
{
cudaError_t err = cudaStreamQuery( Impl::getStream(impl) );
if (err == cudaErrorNotReady || err == cudaSuccess)
return err == cudaSuccess;
cudaSafeCall(err);
return false;
}
void cv::gpu::Stream::waitForCompletion() { cudaSafeCall( cudaStreamSynchronize( Impl::getStream(impl) ) ); }
void cv::gpu::Stream::enqueueDownload(const GpuMat& src, Mat& dst)
{
// dst must be preallocated with matching size/type (ideally backed by page-locked memory); if not, allocation would be done here, but after that dst would not point to page-locked memory
CV_Assert(src.cols == dst.cols && src.rows == dst.rows && src.type() == dst.type() );
devcopy(src, dst, Impl::getStream(impl), cudaMemcpyDeviceToHost);
}
void cv::gpu::Stream::enqueueDownload(const GpuMat& src, CudaMem& dst) { devcopy(src, dst, Impl::getStream(impl), cudaMemcpyDeviceToHost); }
void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst){ devcopy(src, dst, Impl::getStream(impl), cudaMemcpyHostToDevice); }
void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst) { devcopy(src, dst, Impl::getStream(impl), cudaMemcpyHostToDevice); }
void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, Impl::getStream(impl), cudaMemcpyDeviceToDevice); }
void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar s)
{
CV_Assert((src.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
{
cudaSafeCall( cudaMemset2DAsync(src.data, src.step, 0, src.cols * src.elemSize(), src.rows, Impl::getStream(impl)) );
return;
}
if (src.depth() == CV_8U)
{
int cn = src.channels();
if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
{
int val = saturate_cast<uchar>(s[0]);
cudaSafeCall( cudaMemset2DAsync(src.data, src.step, val, src.cols * src.elemSize(), src.rows, Impl::getStream(impl)) );
return;
}
}
setTo(src, s, Impl::getStream(impl));
}
void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
{
CV_Assert((src.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
CV_Assert(mask.type() == CV_8UC1);
setTo(src, val, mask, Impl::getStream(impl));
}
void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
{
CV_Assert((src.depth() != CV_64F && CV_MAT_DEPTH(rtype) != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
bool noScale = fabs(alpha-1) < std::numeric_limits<double>::epsilon() && fabs(beta) < std::numeric_limits<double>::epsilon();
if( rtype < 0 )
rtype = src.type();
else
rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), src.channels());
int sdepth = src.depth(), ddepth = CV_MAT_DEPTH(rtype);
if( sdepth == ddepth && noScale )
{
src.copyTo(dst);
return;
}
GpuMat temp;
const GpuMat* psrc = &src;
if( sdepth != ddepth && psrc == &dst )
psrc = &(temp = src);
dst.create( src.size(), rtype );
convertTo(*psrc, dst, alpha, beta, Impl::getStream(impl)); // use the temporary copy when converting in place
}
cv::gpu::Stream::operator bool() const
{
return impl && impl->stream;
}
cv::gpu::Stream::Stream(Impl* impl_) : impl(impl_) {}
cv::gpu::Stream& cv::gpu::Stream::Null()
{
static Stream s((Impl*)0);
return s;
}
#endif /* !defined (HAVE_CUDA) */
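With the reference-counted Impl above, Stream behaves like a shared handle: copies point at the same cudaStream_t, and the enqueue* methods queue work that runs asynchronously with respect to the host until waitForCompletion (or queryIfComplete) is used. A minimal usage sketch built from the methods defined above; the CudaMem constructor with its default page-locked allocation type is assumed from the 2.4 headers.
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::CudaMem h_src(480, 640, CV_8UC1);       // page-locked host buffer (default alloc type)
        cv::gpu::CudaMem h_dst(480, 640, CV_8UC1);
        cv::gpu::GpuMat d_img;

        cv::gpu::Stream stream;                          // the constructor calls create()
        stream.enqueueUpload(h_src, d_img);              // async host-to-device copy on this stream
        stream.enqueueMemSet(d_img, cv::Scalar::all(0)); // async memset; non-zero values fall back to setTo
        stream.enqueueDownload(d_img, h_dst);            // async device-to-host copy into page-locked memory
        stream.waitForCompletion();                      // block until all queued work is done
        return 0;
    }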


@@ -1,63 +1,63 @@
#include "cuvid_video_source.h"
#include "cu_safe_call.h"
#if defined(HAVE_CUDA) && !defined(__APPLE__)
cv::gpu::detail::CuvidVideoSource::CuvidVideoSource(const std::string& fname)
{
CUVIDSOURCEPARAMS params;
std::memset(&params, 0, sizeof(CUVIDSOURCEPARAMS));
// Fill parameter struct
params.pUserData = this; // will be passed to data handlers
params.pfnVideoDataHandler = HandleVideoData; // our local video-handler callback
params.pfnAudioDataHandler = 0;
// now create the actual source
CUresult res = cuvidCreateVideoSource(&videoSource_, fname.c_str(), &params);
if (res == CUDA_ERROR_INVALID_SOURCE)
throw std::runtime_error("Unsupported video source");
cuSafeCall( res );
CUVIDEOFORMAT vidfmt;
cuSafeCall( cuvidGetSourceVideoFormat(videoSource_, &vidfmt, 0) );
format_.codec = static_cast<VideoReader_GPU::Codec>(vidfmt.codec);
format_.chromaFormat = static_cast<VideoReader_GPU::ChromaFormat>(vidfmt.chroma_format);
format_.width = vidfmt.coded_width;
format_.height = vidfmt.coded_height;
}
cv::gpu::VideoReader_GPU::FormatInfo cv::gpu::detail::CuvidVideoSource::format() const
{
return format_;
}
void cv::gpu::detail::CuvidVideoSource::start()
{
cuSafeCall( cuvidSetVideoSourceState(videoSource_, cudaVideoState_Started) );
}
void cv::gpu::detail::CuvidVideoSource::stop()
{
cuSafeCall( cuvidSetVideoSourceState(videoSource_, cudaVideoState_Stopped) );
}
bool cv::gpu::detail::CuvidVideoSource::isStarted() const
{
return (cuvidGetVideoSourceState(videoSource_) == cudaVideoState_Started);
}
bool cv::gpu::detail::CuvidVideoSource::hasError() const
{
return (cuvidGetVideoSourceState(videoSource_) == cudaVideoState_Error);
}
int CUDAAPI cv::gpu::detail::CuvidVideoSource::HandleVideoData(void* userData, CUVIDSOURCEDATAPACKET* packet)
{
CuvidVideoSource* thiz = static_cast<CuvidVideoSource*>(userData);
return thiz->parseVideoData(packet->payload, packet->payload_size, (packet->flags & CUVID_PKT_ENDOFSTREAM) != 0);
}
#endif // defined(HAVE_CUDA) && !defined(__APPLE__)
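HandleVideoData is the standard way to bridge a C callback API to a C++ object: the this pointer rides along in pUserData and is cast back inside a static member function, which then forwards to parseVideoData. A self-contained sketch of that pattern (names and types here are illustrative, not the CUVID API):
    #include <cstdio>

    // Illustrative stand-in for a C-style callback slot (not part of the CUVID API).
    typedef int (*VideoDataHandler)(void* userData, const unsigned char* data, int size);

    class Source
    {
    public:
        // Registered as the C callback; userData smuggles the Source* back in.
        static int HandleData(void* userData, const unsigned char* data, int size)
        {
            Source* thiz = static_cast<Source*>(userData);
            return thiz->parse(data, size);
        }

    private:
        int parse(const unsigned char* /*data*/, int size)
        {
            std::printf("parsed %d bytes\n", size);
            return 1; // non-zero = keep streaming
        }
    };

    int main()
    {
        Source src;
        VideoDataHandler handler = &Source::HandleData; // what pfnVideoDataHandler amounts to
        unsigned char payload[4] = { 0, 1, 2, 3 };
        handler(&src, payload, 4);                      // the driver would invoke this from its own thread
        return 0;
    }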
#include "cuvid_video_source.h"
#include "cu_safe_call.h"
#if defined(HAVE_CUDA) && !defined(__APPLE__)
cv::gpu::detail::CuvidVideoSource::CuvidVideoSource(const std::string& fname)
{
CUVIDSOURCEPARAMS params;
std::memset(&params, 0, sizeof(CUVIDSOURCEPARAMS));
// Fill parameter struct
params.pUserData = this; // will be passed to data handlers
params.pfnVideoDataHandler = HandleVideoData; // our local video-handler callback
params.pfnAudioDataHandler = 0;
// now create the actual source
CUresult res = cuvidCreateVideoSource(&videoSource_, fname.c_str(), &params);
if (res == CUDA_ERROR_INVALID_SOURCE)
throw std::runtime_error("Unsupported video source");
cuSafeCall( res );
CUVIDEOFORMAT vidfmt;
cuSafeCall( cuvidGetSourceVideoFormat(videoSource_, &vidfmt, 0) );
format_.codec = static_cast<VideoReader_GPU::Codec>(vidfmt.codec);
format_.chromaFormat = static_cast<VideoReader_GPU::ChromaFormat>(vidfmt.chroma_format);
format_.width = vidfmt.coded_width;
format_.height = vidfmt.coded_height;
}
cv::gpu::VideoReader_GPU::FormatInfo cv::gpu::detail::CuvidVideoSource::format() const
{
return format_;
}
void cv::gpu::detail::CuvidVideoSource::start()
{
cuSafeCall( cuvidSetVideoSourceState(videoSource_, cudaVideoState_Started) );
}
void cv::gpu::detail::CuvidVideoSource::stop()
{
cuSafeCall( cuvidSetVideoSourceState(videoSource_, cudaVideoState_Stopped) );
}
bool cv::gpu::detail::CuvidVideoSource::isStarted() const
{
return (cuvidGetVideoSourceState(videoSource_) == cudaVideoState_Started);
}
bool cv::gpu::detail::CuvidVideoSource::hasError() const
{
return (cuvidGetVideoSourceState(videoSource_) == cudaVideoState_Error);
}
int CUDAAPI cv::gpu::detail::CuvidVideoSource::HandleVideoData(void* userData, CUVIDSOURCEDATAPACKET* packet)
{
CuvidVideoSource* thiz = static_cast<CuvidVideoSource*>(userData);
return thiz->parseVideoData(packet->payload, packet->payload_size, (packet->flags & CUVID_PKT_ENDOFSTREAM) != 0);
}
#endif // defined(HAVE_CUDA) && !defined(__APPLE__)


@@ -1,90 +1,90 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __CUVID_VIDEO_SOURCE_H__
#define __CUVID_VIDEO_SOURCE_H__
#include "precomp.hpp"
#if defined(HAVE_CUDA) && !defined(__APPLE__)
namespace cv { namespace gpu
{
namespace detail
{
class CuvidVideoSource : public VideoReader_GPU::VideoSource
{
public:
explicit CuvidVideoSource(const std::string& fname);
~CuvidVideoSource() { cuvidDestroyVideoSource(videoSource_); }
VideoReader_GPU::FormatInfo format() const;
void start();
void stop();
bool isStarted() const;
bool hasError() const;
private:
CuvidVideoSource(const CuvidVideoSource&);
CuvidVideoSource& operator =(const CuvidVideoSource&);
// Callback for handling packages of demuxed video data.
//
// Parameters:
// pUserData - Pointer to user data. We must pass a pointer to a
// VideoSourceData struct here that contains a valid CUvideoparser
// and FrameQueue.
// pPacket - video-source data packet.
//
// NOTE: called from a different thread that does not have a CUDA context
//
static int CUDAAPI HandleVideoData(void* pUserData, CUVIDSOURCEDATAPACKET* pPacket);
CUvideosource videoSource_;
VideoReader_GPU::FormatInfo format_;
};
}
}}
#endif // defined(HAVE_CUDA) && !defined(__APPLE__)
#endif // __CUVID_VIDEO_SOURCE_H__


@@ -124,7 +124,7 @@ void cv::gpu::nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
dst.create(src.size(), src.type());
func(src, dst, search_window/2, block_window/2, h, gpuBorderType, StreamAccessor::getStream(s));
}
@@ -143,7 +143,7 @@ namespace cv { namespace gpu { namespace device
template<typename T>
void nlm_fast_gpu(const PtrStepSzb& src, PtrStepSzb dst, PtrStepi buffer,
int search_window, int block_window, float h, cudaStream_t stream);
void fnlm_split_channels(const PtrStepSz<uchar3>& lab, PtrStepb l, PtrStep<uchar2> ab, cudaStream_t stream);
void fnlm_merge_channels(const PtrStepb& l, const PtrStep<uchar2>& ab, PtrStepSz<uchar3> lab, cudaStream_t stream);
}
@@ -152,30 +152,30 @@ namespace cv { namespace gpu { namespace device
void cv::gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, Stream& s)
{
CV_Assert(src.depth() == CV_8U && src.channels() < 4);
int border_size = search_window/2 + block_window/2;
Size esize = src.size() + Size(border_size, border_size) * 2;
cv::gpu::ensureSizeIsEnough(esize, CV_8UC3, extended_src_buffer);
GpuMat extended_src(esize, src.type(), extended_src_buffer.ptr(), extended_src_buffer.step);
cv::gpu::copyMakeBorder(src, extended_src, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), s);
GpuMat src_hdr = extended_src(Rect(Point2i(border_size, border_size), src.size()));
int bcols, brows;
device::imgproc::nln_fast_get_buffer_size(src_hdr, search_window, block_window, bcols, brows);
buffer.create(brows, bcols, CV_32S);
using namespace cv::gpu::device::imgproc;
typedef void (*nlm_fast_t)(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
static const nlm_fast_t funcs[] = { nlm_fast_gpu<uchar>, nlm_fast_gpu<uchar2>, nlm_fast_gpu<uchar3>, 0};
dst.create(src.size(), src.type());
funcs[src.channels()-1](src_hdr, dst, buffer, search_window, block_window, h, StreamAccessor::getStream(s));
}
void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window, int block_window, Stream& s)
{
#if (CUDA_VERSION < 5000)
(void)src;
(void)dst;
@@ -183,14 +183,14 @@ void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat& src, GpuMat&
(void)h_color;
(void)search_window;
(void)block_window;
(void)s;
CV_Error( CV_GpuApiCallError, "Lab method requires CUDA 5.0 or higher" );
#else
CV_Assert(src.type() == CV_8UC3);
lab.create(src.size(), src.type());
cv::gpu::cvtColor(src, lab, CV_BGR2Lab, 0, s);
@@ -201,7 +201,7 @@ void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat& src, GpuMat&
l.create(src.size(), CV_8U);
ab.create(src.size(), CV_8UC2);
device::imgproc::fnlm_split_channels(lab, l, ab, StreamAccessor::getStream(s));
simpleMethod(l, l, h_luminance, search_window, block_window, s);
simpleMethod(ab, ab, h_color, search_window, block_window, s);
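These hunks only touch whitespace, but they trace the whole fast-NLM path: pad the source with copyMakeBorder, size the integral buffer, and dispatch on channel count; labMethod additionally converts to Lab, splits L from ab, denoises them with separate strengths, and merges back. A short usage sketch built from the two method signatures shown above (the window sizes and filter strengths are just illustrative values):
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat d_gray(480, 640, CV_8UC1, cv::Scalar::all(128));
        cv::gpu::GpuMat d_bgr (480, 640, CV_8UC3, cv::Scalar::all(128));
        cv::gpu::GpuMat d_out1, d_out2;

        cv::gpu::FastNonLocalMeansDenoising fnlm;
        // 8-bit input with up to 3 channels, per the asserts above.
        fnlm.simpleMethod(d_gray, d_out1, 10.0f, 21, 7, cv::gpu::Stream::Null());
        // CV_8UC3 only; separate luminance/chroma strengths (needs CUDA >= 5.0 per the guard above).
        fnlm.labMethod(d_bgr, d_out2, 10.0f, 10.0f, 21, 7, cv::gpu::Stream::Null());
        return 0;
    }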

File diff suppressed because it is too large


@@ -1,249 +1,249 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
using namespace std;
#ifdef HAVE_CUDA
namespace
{
#define error_entry(entry) { entry, #entry }
struct ErrorEntry
{
int code;
string str;
};
struct ErrorEntryComparer
{
int code;
ErrorEntryComparer(int code_) : code(code_) {};
bool operator()(const ErrorEntry& e) const { return e.code == code; }
};
string getErrorString(int code, const ErrorEntry* errors, size_t n)
{
size_t idx = find_if(errors, errors + n, ErrorEntryComparer(code)) - errors;
const string& msg = (idx != n) ? errors[idx].str : string("Unknown error code");
ostringstream ostr;
ostr << msg << " [Code = " << code << "]";
return ostr.str();
}
//////////////////////////////////////////////////////////////////////////
// NPP errors
const ErrorEntry npp_errors [] =
{
error_entry( NPP_NOT_SUPPORTED_MODE_ERROR ),
error_entry( NPP_ROUND_MODE_NOT_SUPPORTED_ERROR ),
error_entry( NPP_RESIZE_NO_OPERATION_ERROR ),
#if defined (_MSC_VER)
error_entry( NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY ),
#endif
error_entry( NPP_BAD_ARG_ERROR ),
error_entry( NPP_LUT_NUMBER_OF_LEVELS_ERROR ),
error_entry( NPP_TEXTURE_BIND_ERROR ),
error_entry( NPP_COEFF_ERROR ),
error_entry( NPP_RECT_ERROR ),
error_entry( NPP_QUAD_ERROR ),
error_entry( NPP_WRONG_INTERSECTION_ROI_ERROR ),
error_entry( NPP_NOT_EVEN_STEP_ERROR ),
error_entry( NPP_INTERPOLATION_ERROR ),
error_entry( NPP_RESIZE_FACTOR_ERROR ),
error_entry( NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR ),
error_entry( NPP_MEMFREE_ERR ),
error_entry( NPP_MEMSET_ERR ),
error_entry( NPP_MEMCPY_ERROR ),
error_entry( NPP_MEM_ALLOC_ERR ),
error_entry( NPP_HISTO_NUMBER_OF_LEVELS_ERROR ),
error_entry( NPP_MIRROR_FLIP_ERR ),
error_entry( NPP_INVALID_INPUT ),
error_entry( NPP_ALIGNMENT_ERROR ),
error_entry( NPP_STEP_ERROR ),
error_entry( NPP_SIZE_ERROR ),
error_entry( NPP_POINTER_ERROR ),
error_entry( NPP_NULL_POINTER_ERROR ),
error_entry( NPP_CUDA_KERNEL_EXECUTION_ERROR ),
error_entry( NPP_NOT_IMPLEMENTED_ERROR ),
error_entry( NPP_ERROR ),
error_entry( NPP_NO_ERROR ),
error_entry( NPP_SUCCESS ),
error_entry( NPP_WARNING ),
error_entry( NPP_WRONG_INTERSECTION_QUAD_WARNING ),
error_entry( NPP_MISALIGNED_DST_ROI_WARNING ),
error_entry( NPP_AFFINE_QUAD_INCORRECT_WARNING ),
error_entry( NPP_DOUBLE_SIZE_WARNING ),
error_entry( NPP_ODD_ROI_WARNING )
};
const size_t npp_error_num = sizeof(npp_errors) / sizeof(npp_errors[0]);
//////////////////////////////////////////////////////////////////////////
// NCV errors
const ErrorEntry ncv_errors [] =
{
error_entry( NCV_SUCCESS ),
error_entry( NCV_UNKNOWN_ERROR ),
error_entry( NCV_CUDA_ERROR ),
error_entry( NCV_NPP_ERROR ),
error_entry( NCV_FILE_ERROR ),
error_entry( NCV_NULL_PTR ),
error_entry( NCV_INCONSISTENT_INPUT ),
error_entry( NCV_TEXTURE_BIND_ERROR ),
error_entry( NCV_DIMENSIONS_INVALID ),
error_entry( NCV_INVALID_ROI ),
error_entry( NCV_INVALID_STEP ),
error_entry( NCV_INVALID_SCALE ),
error_entry( NCV_INVALID_SCALE ),
error_entry( NCV_ALLOCATOR_NOT_INITIALIZED ),
error_entry( NCV_ALLOCATOR_BAD_ALLOC ),
error_entry( NCV_ALLOCATOR_BAD_DEALLOC ),
error_entry( NCV_ALLOCATOR_INSUFFICIENT_CAPACITY ),
error_entry( NCV_ALLOCATOR_DEALLOC_ORDER ),
error_entry( NCV_ALLOCATOR_BAD_REUSE ),
error_entry( NCV_MEM_COPY_ERROR ),
error_entry( NCV_MEM_RESIDENCE_ERROR ),
error_entry( NCV_MEM_INSUFFICIENT_CAPACITY ),
error_entry( NCV_HAAR_INVALID_PIXEL_STEP ),
error_entry( NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER ),
error_entry( NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE ),
error_entry( NCV_HAAR_TOO_LARGE_FEATURES ),
error_entry( NCV_HAAR_XML_LOADING_EXCEPTION ),
error_entry( NCV_NOIMPL_HAAR_TILTED_FEATURES ),
error_entry( NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW ),
error_entry( NPPST_SUCCESS ),
error_entry( NPPST_ERROR ),
error_entry( NPPST_CUDA_KERNEL_EXECUTION_ERROR ),
error_entry( NPPST_NULL_POINTER_ERROR ),
error_entry( NPPST_TEXTURE_BIND_ERROR ),
error_entry( NPPST_MEMCPY_ERROR ),
error_entry( NPPST_MEM_ALLOC_ERR ),
error_entry( NPPST_MEMFREE_ERR ),
error_entry( NPPST_INVALID_ROI ),
error_entry( NPPST_INVALID_STEP ),
error_entry( NPPST_INVALID_SCALE ),
error_entry( NPPST_MEM_INSUFFICIENT_BUFFER ),
error_entry( NPPST_MEM_RESIDENCE_ERROR ),
error_entry( NPPST_MEM_INTERNAL_ERROR )
};
const size_t ncv_error_num = sizeof(ncv_errors) / sizeof(ncv_errors[0]);
//////////////////////////////////////////////////////////////////////////
// CUFFT errors
const ErrorEntry cufft_errors[] =
{
error_entry( CUFFT_INVALID_PLAN ),
error_entry( CUFFT_ALLOC_FAILED ),
error_entry( CUFFT_INVALID_TYPE ),
error_entry( CUFFT_INVALID_VALUE ),
error_entry( CUFFT_INTERNAL_ERROR ),
error_entry( CUFFT_EXEC_FAILED ),
error_entry( CUFFT_SETUP_FAILED ),
error_entry( CUFFT_INVALID_SIZE ),
error_entry( CUFFT_UNALIGNED_DATA )
};
const int cufft_error_num = sizeof(cufft_errors) / sizeof(cufft_errors[0]);
//////////////////////////////////////////////////////////////////////////
// CUBLAS errors
const ErrorEntry cublas_errors[] =
{
error_entry( CUBLAS_STATUS_SUCCESS ),
error_entry( CUBLAS_STATUS_NOT_INITIALIZED ),
error_entry( CUBLAS_STATUS_ALLOC_FAILED ),
error_entry( CUBLAS_STATUS_INVALID_VALUE ),
error_entry( CUBLAS_STATUS_ARCH_MISMATCH ),
error_entry( CUBLAS_STATUS_MAPPING_ERROR ),
error_entry( CUBLAS_STATUS_EXECUTION_FAILED ),
error_entry( CUBLAS_STATUS_INTERNAL_ERROR )
};
const int cublas_error_num = sizeof(cublas_errors) / sizeof(cublas_errors[0]);
}
namespace cv
{
namespace gpu
{
void nppError(int code, const char *file, const int line, const char *func)
{
string msg = getErrorString(code, npp_errors, npp_error_num);
cv::gpu::error(msg.c_str(), file, line, func);
}
void ncvError(int code, const char *file, const int line, const char *func)
{
string msg = getErrorString(code, ncv_errors, ncv_error_num);
cv::gpu::error(msg.c_str(), file, line, func);
}
void cufftError(int code, const char *file, const int line, const char *func)
{
string msg = getErrorString(code, cufft_errors, cufft_error_num);
cv::gpu::error(msg.c_str(), file, line, func);
}
void cublasError(int code, const char *file, const int line, const char *func)
{
string msg = getErrorString(code, cublas_errors, cublas_error_num);
cv::gpu::error(msg.c_str(), file, line, func);
}
}
}
#endif
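Each table row is produced by the error_entry macro, which uses the preprocessor stringizing operator so the row carries both the numeric code and its own identifier as text; getErrorString then does a linear search and falls back to "Unknown error code". A small stand-alone demo of the same trick (the DEMO_* codes are made up for illustration):
    #include <iostream>
    #include <string>

    // Same trick as error_entry above: #entry stringizes the identifier.
    #define error_entry(entry) { entry, #entry }

    struct ErrorEntry { int code; std::string str; };

    enum { DEMO_BAD_ARG = -2, DEMO_OK = 0 };  // illustrative codes, not NPP's

    int main()
    {
        const ErrorEntry demo[] = { error_entry(DEMO_BAD_ARG), error_entry(DEMO_OK) };
        // error_entry(DEMO_BAD_ARG) expanded to { DEMO_BAD_ARG, "DEMO_BAD_ARG" }
        std::cout << demo[0].str << " [Code = " << demo[0].code << "]" << std::endl;
        return 0;
    }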


@@ -1,177 +1,177 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
using namespace std;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
cv::gpu::FAST_GPU::FAST_GPU(int, bool, double) { throw_nogpu(); }
void cv::gpu::FAST_GPU::operator ()(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::FAST_GPU::operator ()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&) { throw_nogpu(); }
void cv::gpu::FAST_GPU::downloadKeypoints(const GpuMat&, std::vector<KeyPoint>&) { throw_nogpu(); }
void cv::gpu::FAST_GPU::convertKeypoints(const Mat&, std::vector<KeyPoint>&) { throw_nogpu(); }
void cv::gpu::FAST_GPU::release() { throw_nogpu(); }
int cv::gpu::FAST_GPU::calcKeyPointsLocation(const GpuMat&, const GpuMat&) { throw_nogpu(); return 0; }
int cv::gpu::FAST_GPU::getKeyPoints(GpuMat&) { throw_nogpu(); return 0; }
#else /* !defined (HAVE_CUDA) */
cv::gpu::FAST_GPU::FAST_GPU(int _threshold, bool _nonmaxSupression, double _keypointsRatio) :
nonmaxSupression(_nonmaxSupression), threshold(_threshold), keypointsRatio(_keypointsRatio), count_(0)
{
}
void cv::gpu::FAST_GPU::operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
{
if (image.empty())
return;
(*this)(image, mask, d_keypoints_);
downloadKeypoints(d_keypoints_, keypoints);
}
void cv::gpu::FAST_GPU::downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints)
{
if (d_keypoints.empty())
return;
Mat h_keypoints(d_keypoints);
convertKeypoints(h_keypoints, keypoints);
}
void cv::gpu::FAST_GPU::convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints)
{
if (h_keypoints.empty())
return;
CV_Assert(h_keypoints.rows == ROWS_COUNT && h_keypoints.elemSize() == 4);
int npoints = h_keypoints.cols;
keypoints.resize(npoints);
const short2* loc_row = h_keypoints.ptr<short2>(LOCATION_ROW);
const float* response_row = h_keypoints.ptr<float>(RESPONSE_ROW);
for (int i = 0; i < npoints; ++i)
{
KeyPoint kp(loc_row[i].x, loc_row[i].y, static_cast<float>(FEATURE_SIZE), -1, response_row[i]);
keypoints[i] = kp;
}
}
void cv::gpu::FAST_GPU::operator ()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints)
{
calcKeyPointsLocation(img, mask);
keypoints.cols = getKeyPoints(keypoints);
}
namespace cv { namespace gpu { namespace device
{
namespace fast
{
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold);
int nonmaxSupression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response);
}
}}}
int cv::gpu::FAST_GPU::calcKeyPointsLocation(const GpuMat& img, const GpuMat& mask)
{
using namespace cv::gpu::device::fast;
CV_Assert(img.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()));
if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
int maxKeypoints = static_cast<int>(keypointsRatio * img.size().area());
ensureSizeIsEnough(1, maxKeypoints, CV_16SC2, kpLoc_);
if (nonmaxSupression)
{
ensureSizeIsEnough(img.size(), CV_32SC1, score_);
score_.setTo(Scalar::all(0));
}
count_ = calcKeypoints_gpu(img, mask, kpLoc_.ptr<short2>(), maxKeypoints, nonmaxSupression ? score_ : PtrStepSzi(), threshold);
count_ = std::min(count_, maxKeypoints);
return count_;
}
int cv::gpu::FAST_GPU::getKeyPoints(GpuMat& keypoints)
{
using namespace cv::gpu::device::fast;
if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
if (count_ == 0)
return 0;
ensureSizeIsEnough(ROWS_COUNT, count_, CV_32FC1, keypoints);
if (nonmaxSupression)
return nonmaxSupression_gpu(kpLoc_.ptr<short2>(), count_, score_, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW));
GpuMat locRow(1, count_, kpLoc_.type(), keypoints.ptr(0));
kpLoc_.colRange(0, count_).copyTo(locRow);
keypoints.row(1).setTo(Scalar::all(0));
return count_;
}
void cv::gpu::FAST_GPU::release()
{
kpLoc_.release();
score_.release();
d_keypoints_.release();
}
#endif /* !defined (HAVE_CUDA) */
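calcKeyPointsLocation and getKeyPoints are the two halves of the detector: the first writes candidate locations (plus a score image when non-max suppression is on), the second compacts them into the two-row keypoint matrix, and the operator() overloads chain both and optionally download to std::vector<KeyPoint>. A minimal usage sketch built only from the members shown above:
    #include <vector>
    #include <opencv2/features2d/features2d.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat d_img(480, 640, CV_8UC1, cv::Scalar::all(0));  // CV_8UC1 input is required
        std::vector<cv::KeyPoint> keypoints;

        // Requires a device built with global atomics (see calcKeyPointsLocation above).
        cv::gpu::FAST_GPU fast(20 /*threshold*/, true /*nonmaxSupression*/, 0.05 /*keypointsRatio*/);
        fast(d_img, cv::gpu::GpuMat() /*no mask*/, keypoints);  // detect and download in one call
        fast.release();                                          // free the internal buffers
        return 0;
    }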

View File

@@ -1,182 +1,182 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "ffmpeg_video_source.h"
#if defined(HAVE_CUDA) && !defined(__APPLE__)
#ifdef HAVE_FFMPEG
#include "cap_ffmpeg_impl.hpp"
#else
#include "cap_ffmpeg_api.hpp"
#endif
namespace
{
Create_InputMediaStream_FFMPEG_Plugin create_InputMediaStream_FFMPEG_p = 0;
Release_InputMediaStream_FFMPEG_Plugin release_InputMediaStream_FFMPEG_p = 0;
Read_InputMediaStream_FFMPEG_Plugin read_InputMediaStream_FFMPEG_p = 0;
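// init_MediaStream_FFMPEG(): one-time, lazy binding of the three demuxer entry points above.
// On Windows the versioned opencv_ffmpeg DLL is loaded at runtime and the symbols are
// resolved with GetProcAddress; otherwise, when HAVE_FFMPEG is defined, the statically
// linked functions are bound directly. Returns true only if all three pointers are valid.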
bool init_MediaStream_FFMPEG()
{
static bool initialized = false;
if (!initialized)
{
#if defined WIN32 || defined _WIN32
const char* module_name = "opencv_ffmpeg"
CVAUX_STR(CV_MAJOR_VERSION) CVAUX_STR(CV_MINOR_VERSION) CVAUX_STR(CV_SUBMINOR_VERSION)
#if (defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__)
"_64"
#endif
".dll";
static HMODULE cvFFOpenCV = LoadLibrary(module_name);
if (cvFFOpenCV)
{
create_InputMediaStream_FFMPEG_p =
(Create_InputMediaStream_FFMPEG_Plugin)GetProcAddress(cvFFOpenCV, "create_InputMediaStream_FFMPEG");
release_InputMediaStream_FFMPEG_p =
(Release_InputMediaStream_FFMPEG_Plugin)GetProcAddress(cvFFOpenCV, "release_InputMediaStream_FFMPEG");
read_InputMediaStream_FFMPEG_p =
(Read_InputMediaStream_FFMPEG_Plugin)GetProcAddress(cvFFOpenCV, "read_InputMediaStream_FFMPEG");
initialized = create_InputMediaStream_FFMPEG_p != 0 && release_InputMediaStream_FFMPEG_p != 0 && read_InputMediaStream_FFMPEG_p != 0;
}
#elif defined HAVE_FFMPEG
create_InputMediaStream_FFMPEG_p = create_InputMediaStream_FFMPEG;
release_InputMediaStream_FFMPEG_p = release_InputMediaStream_FFMPEG;
read_InputMediaStream_FFMPEG_p = read_InputMediaStream_FFMPEG;
initialized = true;
#endif
}
return initialized;
}
}
cv::gpu::detail::FFmpegVideoSource::FFmpegVideoSource(const std::string& fname) :
stream_(0)
{
CV_Assert( init_MediaStream_FFMPEG() );
int codec;
int chroma_format;
int width;
int height;
stream_ = create_InputMediaStream_FFMPEG_p(fname.c_str(), &codec, &chroma_format, &width, &height);
if (!stream_)
CV_Error(CV_StsUnsupportedFormat, "Unsupported video source");
format_.codec = static_cast<VideoReader_GPU::Codec>(codec);
format_.chromaFormat = static_cast<VideoReader_GPU::ChromaFormat>(chroma_format);
format_.width = width;
format_.height = height;
}
cv::gpu::detail::FFmpegVideoSource::~FFmpegVideoSource()
{
release_InputMediaStream_FFMPEG_p(stream_);
}
cv::gpu::VideoReader_GPU::FormatInfo cv::gpu::detail::FFmpegVideoSource::format() const
{
return format_;
}
void cv::gpu::detail::FFmpegVideoSource::start()
{
stop_ = false;
hasError_ = false;
thread_.reset(new Thread(readLoop, this));
}
void cv::gpu::detail::FFmpegVideoSource::stop()
{
stop_ = true;
thread_->wait();
thread_.reset();
}
bool cv::gpu::detail::FFmpegVideoSource::isStarted() const
{
return !stop_;
}
bool cv::gpu::detail::FFmpegVideoSource::hasError() const
{
return hasError_;
}
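// readLoop(): body of the background thread created in start(). It repeatedly reads an
// encoded packet from the FFMPEG stream and hands it to the parser via parseVideoData()
// until end-of-file, a read/parse error, or an external stop request; the final
// parseVideoData(0, 0, true) call signals end-of-stream to the parser.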
void cv::gpu::detail::FFmpegVideoSource::readLoop(void* userData)
{
FFmpegVideoSource* thiz = static_cast<FFmpegVideoSource*>(userData);
for (;;)
{
unsigned char* data;
int size;
int endOfFile;
if (!read_InputMediaStream_FFMPEG_p(thiz->stream_, &data, &size, &endOfFile))
{
thiz->hasError_ = !endOfFile;
break;
}
if (!thiz->parseVideoData(data, size))
{
thiz->hasError_ = true;
break;
}
if (thiz->stop_)
break;
}
thiz->parseVideoData(0, 0, true);
}
#endif // HAVE_CUDA
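A hedged usage sketch of the higher-level GPU video reader that this source ultimately feeds. It is not part of this file; the file name and the empty processing loop are placeholders, and the sketch assumes the OpenCV 2.4 gpu module built with CUDA and NVCUVID available.
// Hedged sketch: decode a video file on the GPU frame by frame.
#include <opencv2/gpu/gpu.hpp>
#include <string>
void decodeOnGpu(const std::string& fname)
{
    cv::gpu::VideoReader_GPU reader(fname);   // internally wraps a raw source such as FFmpegVideoSource
    cv::gpu::GpuMat frame;
    while (reader.read(frame))
    {
        // process 'frame' on the device here
    }
    reader.close();
}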

View File

@@ -1,88 +1,88 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __FFMPEG_VIDEO_SOURCE_H__
#define __FFMPEG_VIDEO_SOURCE_H__
#include "precomp.hpp"
#include "thread_wrappers.h"
#if defined(HAVE_CUDA) && !defined(__APPLE__)
struct InputMediaStream_FFMPEG;
namespace cv { namespace gpu
{
namespace detail
{
class FFmpegVideoSource : public VideoReader_GPU::VideoSource
{
public:
FFmpegVideoSource(const std::string& fname);
~FFmpegVideoSource();
VideoReader_GPU::FormatInfo format() const;
void start();
void stop();
bool isStarted() const;
bool hasError() const;
private:
FFmpegVideoSource(const FFmpegVideoSource&);
FFmpegVideoSource& operator =(const FFmpegVideoSource&);
VideoReader_GPU::FormatInfo format_;
InputMediaStream_FFMPEG* stream_;
std::auto_ptr<Thread> thread_;
volatile bool stop_;
volatile bool hasError_;
static void readLoop(void* userData);
};
}
}}
#endif // HAVE_CUDA
#endif // __FFMPEG_VIDEO_SOURCE_H__

File diff suppressed because it is too large Load Diff

View File

@@ -1,117 +1,117 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "frame_queue.h"
#if defined(HAVE_CUDA) && !defined(__APPLE__)
cv::gpu::detail::FrameQueue::FrameQueue() :
endOfDecode_(0),
framesInQueue_(0),
readPosition_(0)
{
std::memset(displayQueue_, 0, sizeof(displayQueue_));
std::memset((void*)isFrameInUse_, 0, sizeof(isFrameInUse_));
}
bool cv::gpu::detail::FrameQueue::waitUntilFrameAvailable(int pictureIndex)
{
while (isInUse(pictureIndex))
{
// Decoder is getting too far ahead of the display
Thread::sleep(1);
if (isEndOfDecode())
return false;
}
return true;
}
void cv::gpu::detail::FrameQueue::enqueue(const CUVIDPARSERDISPINFO* picParams)
{
// Mark the frame as 'in-use' so we don't re-use it for decoding until it is no longer needed
// for display
isFrameInUse_[picParams->picture_index] = true;
// Wait until we have a free entry in the display queue (should never block if we have enough entries)
do
{
bool isFramePlaced = false;
{
CriticalSection::AutoLock autoLock(criticalSection_);
if (framesInQueue_ < MaximumSize)
{
int writePosition = (readPosition_ + framesInQueue_) % MaximumSize;
displayQueue_[writePosition] = *picParams;
framesInQueue_++;
isFramePlaced = true;
}
}
if (isFramePlaced) // Done
break;
// Wait a bit
Thread::sleep(1);
} while (!isEndOfDecode());
}
bool cv::gpu::detail::FrameQueue::dequeue(CUVIDPARSERDISPINFO& displayInfo)
{
CriticalSection::AutoLock autoLock(criticalSection_);
if (framesInQueue_ > 0)
{
int entry = readPosition_;
displayInfo = displayQueue_[entry];
readPosition_ = (entry + 1) % MaximumSize;
framesInQueue_--;
return true;
}
return false;
}
#endif // HAVE_CUDA
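A hedged sketch of the producer/consumer pattern FrameQueue supports: the decoder thread reserves a picture slot and enqueues its display info, while the display thread drains the queue. Only methods implemented above are called; decodeNextPicture() and renderPicture() are hypothetical stand-ins for the NVCUVID decode and display steps.
// Hedged sketch of the two cooperating loops around cv::gpu::detail::FrameQueue.
#include "frame_queue.h"
bool decodeNextPicture(CUVIDPARSERDISPINFO& dispInfo);    // hypothetical: decodes one picture, fills picture_index
void renderPicture(const CUVIDPARSERDISPINFO& dispInfo);  // hypothetical: maps and displays the decoded picture
void decoderLoop(cv::gpu::detail::FrameQueue& queue)
{
    CUVIDPARSERDISPINFO dispInfo;
    while (decodeNextPicture(dispInfo))
    {
        // Block until the target slot is no longer referenced by the display side.
        if (!queue.waitUntilFrameAvailable(dispInfo.picture_index))
            break;                                        // end of decode was signalled while waiting
        queue.enqueue(&dispInfo);
    }
}
void displayLoop(cv::gpu::detail::FrameQueue& queue)
{
    CUVIDPARSERDISPINFO dispInfo;
    while (!queue.isEndOfDecode())
    {
        if (queue.dequeue(dispInfo))
            renderPicture(dispInfo);                      // a real consumer would also mark the slot free afterwards
    }
}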

Some files were not shown because too many files have changed in this diff Show More