Normalize line endings and whitespace

commit 04384a71e4 (parent 69020da607)
Author:       OpenCV Buildbot
Date:         2012-10-17 03:18:30 +04:00
Committed by: Andrey Kamaev

1516 changed files with 258846 additions and 258162 deletions

@@ -1,212 +1,212 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
NV12ToARGB color space conversion CUDA kernel
This sample uses CUDA to convert a simple NV12 (YUV 4:2:0 planar)
source into ARGB output
*/
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
namespace cv { namespace gpu { namespace device {
namespace video_decoding
{
__constant__ uint constAlpha = ((uint)0xff << 24);
__constant__ float constHueColorSpaceMat[9];
void loadHueCSC(float hueCSC[9])
{
cudaSafeCall( cudaMemcpyToSymbol(constHueColorSpaceMat, hueCSC, 9 * sizeof(float)) );
}
__device__ void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue)
{
float luma, chromaCb, chromaCr;
// Prepare for hue adjustment
luma = (float)yuvi[0];
chromaCb = (float)((int)yuvi[1] - 512.0f);
chromaCr = (float)((int)yuvi[2] - 512.0f);
// Convert YUV To RGB with hue adjustment
*red = (luma * constHueColorSpaceMat[0]) +
(chromaCb * constHueColorSpaceMat[1]) +
(chromaCr * constHueColorSpaceMat[2]);
*green = (luma * constHueColorSpaceMat[3]) +
(chromaCb * constHueColorSpaceMat[4]) +
(chromaCr * constHueColorSpaceMat[5]);
*blue = (luma * constHueColorSpaceMat[6]) +
(chromaCb * constHueColorSpaceMat[7]) +
(chromaCr * constHueColorSpaceMat[8]);
}
__device__ uint RGBAPACK_10bit(float red, float green, float blue, uint alpha)
{
uint ARGBpixel = 0;
// Clamp final 10 bit results
red = ::fmin(::fmax(red, 0.0f), 1023.f);
green = ::fmin(::fmax(green, 0.0f), 1023.f);
blue = ::fmin(::fmax(blue, 0.0f), 1023.f);
// Convert to 8 bit unsigned integers per color component
ARGBpixel = (((uint)blue >> 2) |
(((uint)green >> 2) << 8) |
(((uint)red >> 2) << 16) |
(uint)alpha);
return ARGBpixel;
}
// CUDA kernel that produces the final ARGB output from NV12
#define COLOR_COMPONENT_BIT_SIZE 10
#define COLOR_COMPONENT_MASK 0x3FF
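// Layout of a packed yuv101010Pel word (one pixel): bits [0..9] hold the luma sample,
// bits [10..19] the Cb sample and bits [20..29] the Cr sample, each 8-bit source value
// shifted left by 2 to fill its 10-bit field.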
__global__ void NV12ToARGB(uchar* srcImage, size_t nSourcePitch,
uint* dstImage, size_t nDestPitch,
uint width, uint height)
{
// Pad borders with duplicate pixels; x is multiplied by 2 because each thread processes 2 pixels
const int x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= width || y >= height)
return;
// Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
// if we move to texture we could read 4 luminance values
uint yuv101010Pel[2];
yuv101010Pel[0] = (srcImage[y * nSourcePitch + x ]) << 2;
yuv101010Pel[1] = (srcImage[y * nSourcePitch + x + 1]) << 2;
const size_t chromaOffset = nSourcePitch * height;
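// NV12 layout: a full-resolution luma plane of 'height' rows is followed, at chromaOffset,
// by an interleaved CbCr plane of height/2 rows, so two horizontally adjacent luma samples
// share a single CbCr pair.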
const int y_chroma = y >> 1;
if (y & 1) // odd scanline ?
{
uint chromaCb = srcImage[chromaOffset + y_chroma * nSourcePitch + x ];
uint chromaCr = srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1];
if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
{
chromaCb = (chromaCb + srcImage[chromaOffset + (y_chroma + 1) * nSourcePitch + x ] + 1) >> 1;
chromaCr = (chromaCr + srcImage[chromaOffset + (y_chroma + 1) * nSourcePitch + x + 1] + 1) >> 1;
}
yuv101010Pel[0] |= (chromaCb << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
yuv101010Pel[1] |= (chromaCb << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
}
else
{
yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x ] << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x ] << ( COLOR_COMPONENT_BIT_SIZE + 2));
yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
}
// this step performs the color conversion
uint yuvi[6];
float red[2], green[2], blue[2];
yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK );
yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK);
yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK );
yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE) & COLOR_COMPONENT_MASK);
yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
// YUV to RGB transformation
YUV2RGB(&yuvi[0], &red[0], &green[0], &blue[0]);
YUV2RGB(&yuvi[3], &red[1], &green[1], &blue[1]);
// Clamp the results and pack them into ARGB
const size_t dstImagePitch = nDestPitch >> 2;
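// nDestPitch is in bytes; shifting right by 2 converts it to a pitch measured in 32-bit ARGB pixels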
dstImage[y * dstImagePitch + x ] = RGBAPACK_10bit(red[0], green[0], blue[0], constAlpha);
dstImage[y * dstImagePitch + x + 1 ] = RGBAPACK_10bit(red[1], green[1], blue[1], constAlpha);
}
void NV12ToARGB_gpu(const PtrStepb decodedFrame, PtrStepSz<uint> interopFrame, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(interopFrame.cols, 2 * block.x), divUp(interopFrame.rows, block.y));
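// Each thread converts two horizontally adjacent pixels, so the grid width is divided by 2 * block.x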
NV12ToARGB<<<grid, block, 0, stream>>>(decodedFrame.data, decodedFrame.step, interopFrame.data, interopFrame.step,
interopFrame.cols, interopFrame.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif /* CUDA_DISABLER */
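
For reference, a hypothetical host-side helper (not part of this commit) showing how the two entry points above could be driven. It assumes the OpenCV 2.4 cv::gpu::GpuMat API and that loadHueCSC / NV12ToARGB_gpu are forward-declared on the host side; the color-space coefficients are illustrative BT.601-style values only.

#include "opencv2/gpu/gpu.hpp"

// Hypothetical helper, sketch only: converts a decoded NV12 frame to ARGB.
void nv12FrameToARGB(const cv::gpu::GpuMat& nv12,  // (height * 3 / 2) x width, CV_8UC1
                     cv::gpu::GpuMat& argb,        // height x width, CV_8UC4 on output
                     int width, int height)
{
    // Illustrative BT.601-style YUV -> RGB coefficients (row-major 3x3 matrix).
    float hueCSC[9] = { 1.1644f,  0.0000f,  1.5960f,
                        1.1644f, -0.3918f, -0.8130f,
                        1.1644f,  2.0172f,  0.0000f };
    cv::gpu::device::video_decoding::loadHueCSC(hueCSC);

    argb.create(height, width, CV_8UC4);
    cv::gpu::PtrStepSz<unsigned int> dst(height, width, argb.ptr<unsigned int>(), argb.step);
    cv::gpu::device::video_decoding::NV12ToARGB_gpu(nv12, dst, 0 /* default stream */);
}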

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -1,472 +1,472 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp"
namespace cv { namespace gpu { namespace device
{
namespace bf_radius_match
{
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled
template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
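// s_query holds a BLOCK_SIZE x BLOCK_SIZE tile of the query descriptors in row-major order;
// s_train holds the matching train tile transposed, so in the reduction loop below thread (x, y)
// accumulates the distance between query row (blockIdx.y * BLOCK_SIZE + y) and
// train row (blockIdx.x * BLOCK_SIZE + x).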
Dist dist;
#pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
float distVal = (typename Dist::result_type)dist;
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
{
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
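// atomicInc returns the previous per-query match count, which is used as the output slot;
// matches beyond maxCount are still counted in nMatches but their indices and distances are dropped.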
if (ind < maxCount)
{
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
#endif
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T>
void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
for (int i = 0; i < n; ++i)
{
const PtrStepSz<T> train = trains[i];
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
if (masks != 0 && masks[i].data)
{
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
else
{
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match
template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
Dist dist;
for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
if (loadX < query.cols)
{
T val;
ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
float distVal = (typename Dist::result_type)dist;
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
{
unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
if (ind < maxCount)
{
bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
bestDistance.ptr(queryIdx)[ind] = distVal;
}
}
#endif
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, typename Dist, typename T>
void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
for (int i = 0; i < n; ++i)
{
const PtrStepSz<T> train = trains[i];
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
if (masks != 0 && masks[i].data)
{
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
else
{
match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
}
cudaSafeCall( cudaGetLastError() );
}
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match dispatcher
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
(void)cc;
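// Descriptors with at most 64 or 128 columns use the fully unrolled kernel with a compile-time
// column count; anything wider falls through to the generic match kernel (the 256/512/1024
// unrolled variants are commented out below).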
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 128)
{
matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}*/
else
{
match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
}
}
template <typename Dist, typename T>
void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
(void)cc;
if (query.cols <= 64)
{
matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 128)
{
matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
/*else if (query.cols <= 256)
{
matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}*/
else
{
match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
}
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match caller
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
}
else
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
}
}
template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
}
else
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
}
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
if (mask.data)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
trainIdx, distance, nMatches,
cc, stream);
}
else
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
trainIdx, distance, nMatches,
cc, stream);
}
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
} // namespace bf_radius_match
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
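
Similarly, a hypothetical caller-side sketch (not part of this commit) of the Hamming radius-match entry point above, assuming the OpenCV 2.4 cv::gpu::GpuMat API and a host-side declaration of matchHamming_gpu; the buffer shapes are inferred from the kernels: one row of up to maxCandidates result slots per query descriptor, plus a per-query match counter.

#include "opencv2/gpu/gpu.hpp"

// Hypothetical helper, sketch only: radius match of binary descriptors.
void radiusMatchHamming(const cv::gpu::GpuMat& query,  // nQuery x descriptor length in bytes, CV_8UC1
                        const cv::gpu::GpuMat& train,  // nTrain x descriptor length in bytes, CV_8UC1
                        float maxDistance, int maxCandidates,
                        cv::gpu::GpuMat& trainIdx, cv::gpu::GpuMat& distance, cv::gpu::GpuMat& nMatches)
{
    trainIdx.create(query.rows, maxCandidates, CV_32SC1);
    distance.create(query.rows, maxCandidates, CV_32FC1);
    nMatches.create(1, query.rows, CV_32SC1);
    nMatches.setTo(cv::Scalar::all(0));  // per-query counters must start at zero

    cv::gpu::device::bf_radius_match::matchHamming_gpu<unsigned char>(
        query, train, maxDistance, cv::gpu::PtrStepSzb() /* empty mask */,
        trainIdx, distance, nMatches, 0 /* cc is unused */, 0 /* default stream */);
}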
template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
//template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
int cc, cudaStream_t stream)
{
matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
cc, stream);
}
template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, int cc, cudaStream_t stream);
} // namespace bf_radius_match
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
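For orientation, a host-side caller would typically pick one of the explicit instantiations above by descriptor depth. The sketch below is illustrative only, not the module's actual host code: the `radiusMatchCaller_t` name and the depth-indexed table layout are assumptions, and it presumes the declarations above are visible to the caller.

typedef void (*radiusMatchCaller_t)(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
                                    const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
                                    int cc, cudaStream_t stream);

// Indexed by descriptor depth (CV_8U, CV_8S, CV_16U, CV_16S, CV_32S, CV_32F); 0 marks depths with no instantiation.
static const radiusMatchCaller_t l1_radius_callers[] =
{
    matchL1_gpu<uchar>, 0 /*schar*/, matchL1_gpu<ushort>, matchL1_gpu<short>, matchL1_gpu<int>, matchL1_gpu<float>
};

// l1_radius_callers[depth](query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, stream);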

View File

@@ -1,201 +1,201 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
using namespace cv::gpu;
typedef unsigned char uchar;
typedef unsigned short ushort;
//////////////////////////////////////////////////////////////////////////////////
/// Bilateral filtering
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
__device__ __forceinline__ float norm_l1(const float& a) { return ::fabs(a); }
__device__ __forceinline__ float norm_l1(const float2& a) { return ::fabs(a.x) + ::fabs(a.y); }
__device__ __forceinline__ float norm_l1(const float3& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z); }
__device__ __forceinline__ float norm_l1(const float4& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z) + ::fabs(a.w); }
__device__ __forceinline__ float sqr(const float& a) { return a * a; }
template<typename T, typename B>
__global__ void bilateral_kernel(const PtrStepSz<T> src, PtrStep<T> dst, const B b, const int ksz, const float sigma_spatial2_inv_half, const float sigma_color2_inv_half)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x >= src.cols || y >= src.rows)
return;
value_type center = saturate_cast<value_type>(src(y, x));
value_type sum1 = VecTraits<value_type>::all(0);
float sum2 = 0;
int r = ksz / 2;
float r2 = (float)(r * r);
int tx = x - r + ksz;
int ty = y - r + ksz;
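// The filter window is [x - r, tx) x [y - r, ty); the branch below takes the direct-access path
// only when that whole window lies inside the image, and otherwise reads through the border handler b.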
if (x - ksz/2 >=0 && y - ksz/2 >=0 && tx < src.cols && ty < src.rows)
{
for (int cy = y - r; cy < ty; ++cy)
for (int cx = x - r; cx < tx; ++cx)
{
float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
if (space2 > r2)
continue;
value_type value = saturate_cast<value_type>(src(cy, cx));
float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
sum1 = sum1 + weight * value;
sum2 = sum2 + weight;
}
}
else
{
for (int cy = y - r; cy < ty; ++cy)
for (int cx = x - r; cx < tx; ++cx)
{
float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
if (space2 > r2)
continue;
value_type value = saturate_cast<value_type>(b.at(cy, cx, src.data, src.step));
float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
sum1 = sum1 + weight * value;
sum2 = sum2 + weight;
}
}
dst(y, x) = saturate_cast<T>(sum1 / sum2);
}
template<typename T, template <typename> class B>
void bilateral_caller(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream)
{
dim3 block (32, 8);
dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y));
B<T> b(src.rows, src.cols);
float sigma_spatial2_inv_half = -0.5f/(sigma_spatial * sigma_spatial);
float sigma_color2_inv_half = -0.5f/(sigma_color * sigma_color);
cudaSafeCall( cudaFuncSetCacheConfig (bilateral_kernel<T, B<T> >, cudaFuncCachePreferL1) );
bilateral_kernel<<<grid, block>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, kernel_size, sigma_spatial2_inv_half, sigma_color2_inv_half);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template<typename T>
void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float gauss_spatial_coeff, float gauss_color_coeff, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream);
static caller_t funcs[] =
{
bilateral_caller<T, BrdReflect101>,
bilateral_caller<T, BrdReplicate>,
bilateral_caller<T, BrdConstant>,
bilateral_caller<T, BrdReflect>,
bilateral_caller<T, BrdWrap>,
};
funcs[borderMode](src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream);
}
}
}}}
#define OCV_INSTANTIATE_BILATERAL_FILTER(T) \
template void cv::gpu::device::imgproc::bilateral_filter_gpu<T>(const PtrStepSzb&, PtrStepSzb, int, float, float, int, cudaStream_t);
OCV_INSTANTIATE_BILATERAL_FILTER(uchar)
//OCV_INSTANTIATE_BILATERAL_FILTER(uchar2)
OCV_INSTANTIATE_BILATERAL_FILTER(uchar3)
OCV_INSTANTIATE_BILATERAL_FILTER(uchar4)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar2)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar3)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar4)
OCV_INSTANTIATE_BILATERAL_FILTER(short)
//OCV_INSTANTIATE_BILATERAL_FILTER(short2)
OCV_INSTANTIATE_BILATERAL_FILTER(short3)
OCV_INSTANTIATE_BILATERAL_FILTER(short4)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort)
//OCV_INSTANTIATE_BILATERAL_FILTER(ushort2)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort3)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort4)
//OCV_INSTANTIATE_BILATERAL_FILTER(int)
//OCV_INSTANTIATE_BILATERAL_FILTER(int2)
//OCV_INSTANTIATE_BILATERAL_FILTER(int3)
//OCV_INSTANTIATE_BILATERAL_FILTER(int4)
OCV_INSTANTIATE_BILATERAL_FILTER(float)
//OCV_INSTANTIATE_BILATERAL_FILTER(float2)
OCV_INSTANTIATE_BILATERAL_FILTER(float3)
OCV_INSTANTIATE_BILATERAL_FILTER(float4)
#endif /* CUDA_DISABLER */
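For reference, the weight accumulated per neighbour in bilateral_kernel above is the usual bilateral product of a spatial and a range Gaussian. A minimal CPU-side restatement of the same expression follows; the helper name and the explicit <cmath> include are illustrative, not part of the module.

#include <cmath>

// w(p, q) = exp( -|p - q|^2 / (2 * sigma_spatial^2)  -  |I(p) - I(q)|_1^2 / (2 * sigma_color^2) )
inline float bilateralWeightRef(float space2, float color_l1, float sigma_spatial, float sigma_color)
{
    const float spatial_coeff = -0.5f / (sigma_spatial * sigma_spatial); // sigma_spatial2_inv_half above
    const float color_coeff   = -0.5f / (sigma_color  * sigma_color);    // sigma_color2_inv_half above
    return std::exp(space2 * spatial_coeff + color_l1 * color_l1 * color_coeff);
}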

View File

@@ -1,121 +1,121 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace device
{
namespace blend
{
template <typename T>
__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < rows && x < cols)
{
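// x runs over interleaved channels (the kernel is launched over cols * cn columns),
// while the weight maps hold one value per pixel, so x / cn recovers the pixel column.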
int x_ = x / cn;
float w1 = weights1.ptr(y)[x_];
float w2 = weights2.ptr(y)[x_];
T p1 = img1.ptr(y)[x];
T p2 = img2.ptr(y)[x];
result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
}
}
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));
blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < rows && x < cols)
{
float w1 = weights1.ptr(y)[x];
float w2 = weights2.ptr(y)[x];
float sum_inv = 1.f / (w1 + w2 + 1e-5f);
w1 *= sum_inv;
w2 *= sum_inv;
uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];
uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];
((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,
p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);
}
}
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
} // namespace blend
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
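The per-pixel arithmetic in both kernels above is a normalized linear blend. A small scalar CPU reference is given below purely for clarity; the function name is illustrative and not part of the module.

// result = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f); the 1e-5f term guards against a zero weight sum.
inline float blendLinearRef(float p1, float p2, float w1, float w2)
{
    return (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
}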

File diff suppressed because it is too large

View File

@@ -1,384 +1,384 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <internal_shared.hpp>
#include <opencv2/gpu/device/transform.hpp>
#include <opencv2/gpu/device/color.hpp>
#include <cvt_colot_internal.h>
namespace cv { namespace gpu { namespace device
{
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_x = 8 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
void name(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream) \
{ \
traits::functor_type functor = traits::create_functor(); \
typedef typename traits::functor_type::argument_type src_t; \
typedef typename traits::functor_type::result_type dst_t; \
cv::gpu::device::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \
}
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
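To make the macro machinery above easier to follow, here is roughly what a single instantiation line such as OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb) turns into once the preprocessor has run. This is a hand-simplified sketch, not literal compiler output, and it assumes the bgr_to_rgb_traits<T> functor traits that the colour-conversion detail headers define elsewhere in the source tree:
// Illustrative expansion of OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb): three overloads,
// one per supported depth, each forwarding to the generic transform() engine with no mask.
void bgr_to_rgb_8u(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream)
{
    bgr_to_rgb_traits<uchar>::functor_type functor = bgr_to_rgb_traits<uchar>::create_functor();
    typedef bgr_to_rgb_traits<uchar>::functor_type::argument_type src_t;   // a 3-channel uchar vector
    typedef bgr_to_rgb_traits<uchar>::functor_type::result_type   dst_t;   // a 3-channel uchar vector
    cv::gpu::device::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream);
}
// bgr_to_rgb_16u and bgr_to_rgb_32f are generated the same way from the ushort and float traits.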

View File

@@ -1,388 +1,388 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/static_check.hpp"
namespace cv { namespace gpu { namespace device
{
namespace column_filter
{
#define MAX_KERNEL_SIZE 32
__constant__ float c_kernel[MAX_KERNEL_SIZE];
void loadKernel(const float* kernel, int ksize, cudaStream_t stream)
{
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
}
template <int KSIZE, typename T, typename D, typename B>
__global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
const int BLOCK_DIM_X = 16;
const int BLOCK_DIM_Y = 16;
const int PATCH_PER_BLOCK = 4;
const int HALO_SIZE = KSIZE <= 16 ? 1 : 2;
#else
const int BLOCK_DIM_X = 16;
const int BLOCK_DIM_Y = 8;
const int PATCH_PER_BLOCK = 2;
const int HALO_SIZE = 2;
#endif
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
__shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X];
const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
if (x >= src.cols)
return;
const T* src_col = src.ptr() + x;
const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;
if (blockIdx.y > 0)
{
//Upper halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x));
}
else
{
//Upper halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));
}
if (blockIdx.y + 2 < gridDim.y)
{
//Main data
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x));
//Lower halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x));
}
else
{
//Main data
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));
//Lower halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));
}
__syncthreads();
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
{
const int y = yStart + j * BLOCK_DIM_Y;
if (y < src.rows)
{
sum_t sum = VecTraits<sum_t>::all(0);
#pragma unroll
for (int k = 0; k < KSIZE; ++k)
sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
dst(y, x) = saturate_cast<D>(sum);
}
}
}
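A quick sizing note on the shared-memory tile declared at the top of the kernel above; the arithmetic is an editor's sketch derived from the cc >= 2.0 constants and is not stated in the original file:
// For KSIZE <= 16 on sm_20+ (HALO_SIZE = 1) the tile is
//   (PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y = (4 + 2) * 16 = 96 rows by BLOCK_DIM_X = 16 columns.
// Worst case sum_t = float4 (4-channel input): 96 * 16 * sizeof(float4) = 96 * 16 * 16 = 24576 bytes,
// i.e. 24 KB per block, comfortably inside the 48 KB of shared memory on Fermi-class devices;
// for KSIZE > 16 (HALO_SIZE = 2) the tile grows to 128 rows, i.e. 32 KB for float4.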
template <int KSIZE, typename T, typename D, template<typename> class B>
void linearColumnFilter_caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
{
int BLOCK_DIM_X;
int BLOCK_DIM_Y;
int PATCH_PER_BLOCK;
if (cc >= 20)
{
BLOCK_DIM_X = 16;
BLOCK_DIM_Y = 16;
PATCH_PER_BLOCK = 4;
}
else
{
BLOCK_DIM_X = 16;
BLOCK_DIM_Y = 8;
PATCH_PER_BLOCK = 2;
}
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK));
B<T> brd(src.rows);
linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, typename D>
void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
static const caller_t callers[5][33] =
{
{
0,
linearColumnFilter_caller< 1, T, D, BrdColReflect101>,
linearColumnFilter_caller< 2, T, D, BrdColReflect101>,
linearColumnFilter_caller< 3, T, D, BrdColReflect101>,
linearColumnFilter_caller< 4, T, D, BrdColReflect101>,
linearColumnFilter_caller< 5, T, D, BrdColReflect101>,
linearColumnFilter_caller< 6, T, D, BrdColReflect101>,
linearColumnFilter_caller< 7, T, D, BrdColReflect101>,
linearColumnFilter_caller< 8, T, D, BrdColReflect101>,
linearColumnFilter_caller< 9, T, D, BrdColReflect101>,
linearColumnFilter_caller<10, T, D, BrdColReflect101>,
linearColumnFilter_caller<11, T, D, BrdColReflect101>,
linearColumnFilter_caller<12, T, D, BrdColReflect101>,
linearColumnFilter_caller<13, T, D, BrdColReflect101>,
linearColumnFilter_caller<14, T, D, BrdColReflect101>,
linearColumnFilter_caller<15, T, D, BrdColReflect101>,
linearColumnFilter_caller<16, T, D, BrdColReflect101>,
linearColumnFilter_caller<17, T, D, BrdColReflect101>,
linearColumnFilter_caller<18, T, D, BrdColReflect101>,
linearColumnFilter_caller<19, T, D, BrdColReflect101>,
linearColumnFilter_caller<20, T, D, BrdColReflect101>,
linearColumnFilter_caller<21, T, D, BrdColReflect101>,
linearColumnFilter_caller<22, T, D, BrdColReflect101>,
linearColumnFilter_caller<23, T, D, BrdColReflect101>,
linearColumnFilter_caller<24, T, D, BrdColReflect101>,
linearColumnFilter_caller<25, T, D, BrdColReflect101>,
linearColumnFilter_caller<26, T, D, BrdColReflect101>,
linearColumnFilter_caller<27, T, D, BrdColReflect101>,
linearColumnFilter_caller<28, T, D, BrdColReflect101>,
linearColumnFilter_caller<29, T, D, BrdColReflect101>,
linearColumnFilter_caller<30, T, D, BrdColReflect101>,
linearColumnFilter_caller<31, T, D, BrdColReflect101>,
linearColumnFilter_caller<32, T, D, BrdColReflect101>
},
{
0,
linearColumnFilter_caller< 1, T, D, BrdColReplicate>,
linearColumnFilter_caller< 2, T, D, BrdColReplicate>,
linearColumnFilter_caller< 3, T, D, BrdColReplicate>,
linearColumnFilter_caller< 4, T, D, BrdColReplicate>,
linearColumnFilter_caller< 5, T, D, BrdColReplicate>,
linearColumnFilter_caller< 6, T, D, BrdColReplicate>,
linearColumnFilter_caller< 7, T, D, BrdColReplicate>,
linearColumnFilter_caller< 8, T, D, BrdColReplicate>,
linearColumnFilter_caller< 9, T, D, BrdColReplicate>,
linearColumnFilter_caller<10, T, D, BrdColReplicate>,
linearColumnFilter_caller<11, T, D, BrdColReplicate>,
linearColumnFilter_caller<12, T, D, BrdColReplicate>,
linearColumnFilter_caller<13, T, D, BrdColReplicate>,
linearColumnFilter_caller<14, T, D, BrdColReplicate>,
linearColumnFilter_caller<15, T, D, BrdColReplicate>,
linearColumnFilter_caller<16, T, D, BrdColReplicate>,
linearColumnFilter_caller<17, T, D, BrdColReplicate>,
linearColumnFilter_caller<18, T, D, BrdColReplicate>,
linearColumnFilter_caller<19, T, D, BrdColReplicate>,
linearColumnFilter_caller<20, T, D, BrdColReplicate>,
linearColumnFilter_caller<21, T, D, BrdColReplicate>,
linearColumnFilter_caller<22, T, D, BrdColReplicate>,
linearColumnFilter_caller<23, T, D, BrdColReplicate>,
linearColumnFilter_caller<24, T, D, BrdColReplicate>,
linearColumnFilter_caller<25, T, D, BrdColReplicate>,
linearColumnFilter_caller<26, T, D, BrdColReplicate>,
linearColumnFilter_caller<27, T, D, BrdColReplicate>,
linearColumnFilter_caller<28, T, D, BrdColReplicate>,
linearColumnFilter_caller<29, T, D, BrdColReplicate>,
linearColumnFilter_caller<30, T, D, BrdColReplicate>,
linearColumnFilter_caller<31, T, D, BrdColReplicate>,
linearColumnFilter_caller<32, T, D, BrdColReplicate>
},
{
0,
linearColumnFilter_caller< 1, T, D, BrdColConstant>,
linearColumnFilter_caller< 2, T, D, BrdColConstant>,
linearColumnFilter_caller< 3, T, D, BrdColConstant>,
linearColumnFilter_caller< 4, T, D, BrdColConstant>,
linearColumnFilter_caller< 5, T, D, BrdColConstant>,
linearColumnFilter_caller< 6, T, D, BrdColConstant>,
linearColumnFilter_caller< 7, T, D, BrdColConstant>,
linearColumnFilter_caller< 8, T, D, BrdColConstant>,
linearColumnFilter_caller< 9, T, D, BrdColConstant>,
linearColumnFilter_caller<10, T, D, BrdColConstant>,
linearColumnFilter_caller<11, T, D, BrdColConstant>,
linearColumnFilter_caller<12, T, D, BrdColConstant>,
linearColumnFilter_caller<13, T, D, BrdColConstant>,
linearColumnFilter_caller<14, T, D, BrdColConstant>,
linearColumnFilter_caller<15, T, D, BrdColConstant>,
linearColumnFilter_caller<16, T, D, BrdColConstant>,
linearColumnFilter_caller<17, T, D, BrdColConstant>,
linearColumnFilter_caller<18, T, D, BrdColConstant>,
linearColumnFilter_caller<19, T, D, BrdColConstant>,
linearColumnFilter_caller<20, T, D, BrdColConstant>,
linearColumnFilter_caller<21, T, D, BrdColConstant>,
linearColumnFilter_caller<22, T, D, BrdColConstant>,
linearColumnFilter_caller<23, T, D, BrdColConstant>,
linearColumnFilter_caller<24, T, D, BrdColConstant>,
linearColumnFilter_caller<25, T, D, BrdColConstant>,
linearColumnFilter_caller<26, T, D, BrdColConstant>,
linearColumnFilter_caller<27, T, D, BrdColConstant>,
linearColumnFilter_caller<28, T, D, BrdColConstant>,
linearColumnFilter_caller<29, T, D, BrdColConstant>,
linearColumnFilter_caller<30, T, D, BrdColConstant>,
linearColumnFilter_caller<31, T, D, BrdColConstant>,
linearColumnFilter_caller<32, T, D, BrdColConstant>
},
{
0,
linearColumnFilter_caller< 1, T, D, BrdColReflect>,
linearColumnFilter_caller< 2, T, D, BrdColReflect>,
linearColumnFilter_caller< 3, T, D, BrdColReflect>,
linearColumnFilter_caller< 4, T, D, BrdColReflect>,
linearColumnFilter_caller< 5, T, D, BrdColReflect>,
linearColumnFilter_caller< 6, T, D, BrdColReflect>,
linearColumnFilter_caller< 7, T, D, BrdColReflect>,
linearColumnFilter_caller< 8, T, D, BrdColReflect>,
linearColumnFilter_caller< 9, T, D, BrdColReflect>,
linearColumnFilter_caller<10, T, D, BrdColReflect>,
linearColumnFilter_caller<11, T, D, BrdColReflect>,
linearColumnFilter_caller<12, T, D, BrdColReflect>,
linearColumnFilter_caller<13, T, D, BrdColReflect>,
linearColumnFilter_caller<14, T, D, BrdColReflect>,
linearColumnFilter_caller<15, T, D, BrdColReflect>,
linearColumnFilter_caller<16, T, D, BrdColReflect>,
linearColumnFilter_caller<17, T, D, BrdColReflect>,
linearColumnFilter_caller<18, T, D, BrdColReflect>,
linearColumnFilter_caller<19, T, D, BrdColReflect>,
linearColumnFilter_caller<20, T, D, BrdColReflect>,
linearColumnFilter_caller<21, T, D, BrdColReflect>,
linearColumnFilter_caller<22, T, D, BrdColReflect>,
linearColumnFilter_caller<23, T, D, BrdColReflect>,
linearColumnFilter_caller<24, T, D, BrdColReflect>,
linearColumnFilter_caller<25, T, D, BrdColReflect>,
linearColumnFilter_caller<26, T, D, BrdColReflect>,
linearColumnFilter_caller<27, T, D, BrdColReflect>,
linearColumnFilter_caller<28, T, D, BrdColReflect>,
linearColumnFilter_caller<29, T, D, BrdColReflect>,
linearColumnFilter_caller<30, T, D, BrdColReflect>,
linearColumnFilter_caller<31, T, D, BrdColReflect>,
linearColumnFilter_caller<32, T, D, BrdColReflect>
},
{
0,
linearColumnFilter_caller< 1, T, D, BrdColWrap>,
linearColumnFilter_caller< 2, T, D, BrdColWrap>,
linearColumnFilter_caller< 3, T, D, BrdColWrap>,
linearColumnFilter_caller< 4, T, D, BrdColWrap>,
linearColumnFilter_caller< 5, T, D, BrdColWrap>,
linearColumnFilter_caller< 6, T, D, BrdColWrap>,
linearColumnFilter_caller< 7, T, D, BrdColWrap>,
linearColumnFilter_caller< 8, T, D, BrdColWrap>,
linearColumnFilter_caller< 9, T, D, BrdColWrap>,
linearColumnFilter_caller<10, T, D, BrdColWrap>,
linearColumnFilter_caller<11, T, D, BrdColWrap>,
linearColumnFilter_caller<12, T, D, BrdColWrap>,
linearColumnFilter_caller<13, T, D, BrdColWrap>,
linearColumnFilter_caller<14, T, D, BrdColWrap>,
linearColumnFilter_caller<15, T, D, BrdColWrap>,
linearColumnFilter_caller<16, T, D, BrdColWrap>,
linearColumnFilter_caller<17, T, D, BrdColWrap>,
linearColumnFilter_caller<18, T, D, BrdColWrap>,
linearColumnFilter_caller<19, T, D, BrdColWrap>,
linearColumnFilter_caller<20, T, D, BrdColWrap>,
linearColumnFilter_caller<21, T, D, BrdColWrap>,
linearColumnFilter_caller<22, T, D, BrdColWrap>,
linearColumnFilter_caller<23, T, D, BrdColWrap>,
linearColumnFilter_caller<24, T, D, BrdColWrap>,
linearColumnFilter_caller<25, T, D, BrdColWrap>,
linearColumnFilter_caller<26, T, D, BrdColWrap>,
linearColumnFilter_caller<27, T, D, BrdColWrap>,
linearColumnFilter_caller<28, T, D, BrdColWrap>,
linearColumnFilter_caller<29, T, D, BrdColWrap>,
linearColumnFilter_caller<30, T, D, BrdColWrap>,
linearColumnFilter_caller<31, T, D, BrdColWrap>,
linearColumnFilter_caller<32, T, D, BrdColWrap>
}
};
loadKernel(kernel, ksize, stream);
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
}
template void linearColumnFilter_gpu<float , uchar >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float , int >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
} // namespace column_filter
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
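The callers table maps the (brd_type, ksize) pair onto a fully specialized launcher, so host code only has to pick two integers. Below is a minimal host-side sketch of driving this entry point for a 7-tap filter on a single-channel float image; d_src, d_dst and d_kernel are assumed to be device allocations prepared by the caller (the taps must live in device memory, since loadKernel copies them device-to-device into constant memory), and the wrapper name is illustrative only:
// Hedged usage sketch, not part of the original file.
void run_column_filter(cv::gpu::PtrStepSzb d_src, cv::gpu::PtrStepSzb d_dst, const float* d_kernel)
{
    const int ksize    = 7;    // must lie in 1..32 (MAX_KERNEL_SIZE); second index into callers[][]
    const int anchor   = 3;    // kernel centre
    const int brd_type = 0;    // row of callers[]: 0 Reflect101, 1 Replicate, 2 Constant, 3 Reflect, 4 Wrap
    const int cc       = 20;   // compute capability; >= 20 selects the 16x16-block, 4-patch configuration
    cv::gpu::device::column_filter::linearColumnFilter_gpu<float, float>(
        d_src, d_dst, d_kernel, ksize, anchor, brd_type, cc,
        /*stream=*/0);         // the null stream makes the launcher call cudaDeviceSynchronize afterwards
}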

View File

@@ -1,131 +1,131 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, PtrStepSz<T> dst, int top, int left)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = src(y - top, x - left);
}
template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
{
static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, int top, int left,
const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));
BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);
copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode,
const T* borderValue, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type vec_type;
typedef void (*caller_t)(const PtrStepSz<vec_type>& src, const PtrStepSz<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
static const caller_t callers[5] =
{
CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call,
CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
CopyMakeBorderDispatcher<BrdWrap, vec_type>::call
};
callers[borderMode](PtrStepSz<vec_type>(src), PtrStepSz<vec_type>(dst), top, left, borderValue, stream);
}
template void copyMakeBorder_gpu<uchar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<ushort, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<short, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<float, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
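Here, too, the border mode is just an index into a five-entry dispatch table (Reflect101, Replicate, Constant, Reflect, Wrap, in that order). A hedged host-side sketch of padding an 8-bit, 3-channel image by 8 pixels with a constant colour follows; the buffer names are illustrative, and the destination is assumed to have been allocated at the padded size, since the kernel only writes pixels and does not allocate:
// Hedged usage sketch, not part of the original file.
void pad_bgr(cv::gpu::PtrStepSzb d_src, cv::gpu::PtrStepSzb d_dst, cudaStream_t stream)
{
    const int top = 8, left = 8;                 // offsets of the source inside the destination
    const int borderMode = 2;                    // 2 = BrdConstant in the callers[] table above
    const uchar borderValue[3] = { 0, 0, 0 };    // per-channel fill value for the constant border
    cv::gpu::device::imgproc::copyMakeBorder_gpu<uchar, 3>(
        d_src, d_dst, top, left, borderMode, borderValue, stream);
}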

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@@ -1,151 +1,151 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
//
// The original code was written by Paul Furgale and Chi Hay Tong
// and later optimized and prepared for integration into OpenCV by Itseez.
//
//M*/
#if !defined CUDA_DISABLER
#include <thrust/sort.h>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
namespace cv { namespace gpu { namespace device
{
namespace gfft
{
texture<float, cudaTextureType2D, cudaReadModeElementType> eigTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__device__ uint g_counter = 0;
template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, uint max_count, int rows, int cols)
{
#if __CUDA_ARCH__ >= 110
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j))
{
float val = tex2D(eigTex, j, i);
if (val > threshold)
{
float maxVal = val;
maxVal = ::fmax(tex2D(eigTex, j - 1, i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j , i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j - 1, i), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i), maxVal);
maxVal = ::fmax(tex2D(eigTex, j - 1, i + 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j , i + 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i + 1), maxVal);
if (val == maxVal)
{
const uint ind = atomicInc(&g_counter, (uint)(-1));
if (ind < max_count)
corners[ind] = make_float2(j, i);
}
}
}
#endif // __CUDA_ARCH__ >= 110
}
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(uint)) );
bindTexture(&eigTex, eig);
dim3 block(16, 16);
dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));
if (mask.data)
findCorners<<<grid, block>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
else
findCorners<<<grid, block>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
uint count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(uint), cudaMemcpyDeviceToHost) );
return min(count, max_count);
}
class EigGreater
{
public:
__device__ __forceinline__ bool operator()(float2 a, float2 b) const
{
return tex2D(eigTex, a.x, a.y) > tex2D(eigTex, b.x, b.y);
}
};
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count)
{
bindTexture(&eigTex, eig);
thrust::device_ptr<float2> ptr(corners);
thrust::sort(ptr, ptr + count, EigGreater());
}
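// Illustration only, not part of the original file: a minimal host-side sketch of how
// findCorners_gpu and sortCorners_gpu might be chained. It assumes `eig` is a single-channel
// float corner-response map already on the device and that the caller wants at most
// `max_corners` positions, strongest first; buffer handling is reduced to the bare minimum.
static int goodFeaturesExample(PtrStepSzf eig, float threshold, int max_corners)
{
float2* corners = 0;
cudaSafeCall( cudaMalloc(&corners, max_corners * sizeof(float2)) );
// keep local maxima above the threshold; a default-constructed mask selects the WithOutMask path
const int found = findCorners_gpu(eig, threshold, PtrStepSzb(), corners, max_corners);
if (found > 0)
sortCorners_gpu(eig, corners, found); // order by eigenvalue, descending
// a real caller would copy or consume `corners` before freeing the buffer
cudaSafeCall( cudaFree(corners) );
return found;
}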
} // namespace gfft
}}}
#endif /* CUDA_DISABLER */

View File

@@ -1,224 +1,224 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
namespace cv { namespace gpu { namespace device
{
#define UINT_BITS 32U
//Warps == subhistograms per threadblock
#define WARP_COUNT 6
//Threadblock size
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * OPENCV_GPU_WARP_SIZE)
#define HISTOGRAM256_BIN_COUNT 256
//Shared memory per threadblock
#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
#define PARTIAL_HISTOGRAM256_COUNT 240
#define MERGE_THREADBLOCK_SIZE 256
#define USE_SMEM_ATOMICS (defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120))
namespace hist
{
#if (!USE_SMEM_ATOMICS)
#define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )
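// On devices without shared-memory atomics (compute capability below 1.2) each warp owns one
// sub-histogram, and the top OPENCV_GPU_LOG_WARP_SIZE bits of every bin hold the id of the last
// lane that wrote it. The do/while below keeps re-writing the tagged value until this lane's
// write survives, which serialises colliding lanes within a warp and so emulates an atomic
// increment; TAG_MASK strips the tag again when the counts are read back.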
__forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
{
uint count;
do
{
count = s_WarpHist[data] & TAG_MASK;
count = threadTag | (count + 1);
s_WarpHist[data] = count;
} while (s_WarpHist[data] != count);
}
#else
#define TAG_MASK 0xFFFFFFFFU
__forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)
{
atomicAdd(s_WarpHist + data, 1);
}
#endif
__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
{
uint x = pos_x << 2;
if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);
if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);
if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
}
__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
{
//Per-warp subhistogram storage
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
//Clear shared memory storage for current threadblock before processing
#pragma unroll
for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
//Cycle through the entire data set, update subhistograms for each warp
const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);
__syncthreads();
const uint colsui = d_Data.step / sizeof(uint);
for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
{
uint pos_y = pos / colsui;
uint pos_x = pos % colsui;
uint data = d_Data.ptr(pos_y)[pos_x];
addWord(s_WarpHist, data, tag, pos_x, cols);
}
//Merge per-warp histograms into per-block and write to global memory
__syncthreads();
for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
{
uint sum = 0;
for (uint i = 0; i < WARP_COUNT; i++)
sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
}
}
////////////////////////////////////////////////////////////////////////////////
// Merge histogram256() output
// Run one threadblock per bin; each threadblock adds up the same bin counter
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
// takes only a fraction of total processing time
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
{
uint sum = 0;
#pragma unroll
for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
__shared__ uint data[MERGE_THREADBLOCK_SIZE];
data[threadIdx.x] = sum;
for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
{
__syncthreads();
if(threadIdx.x < stride)
data[threadIdx.x] += data[threadIdx.x + stride];
}
if(threadIdx.x == 0)
d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
}
void histogram256_gpu(PtrStepSzb src, int* hist, uint* buf, cudaStream_t stream)
{
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
PtrStepSz<uint>(src),
buf,
static_cast<uint>(src.rows * src.step / sizeof(uint)),
src.cols);
cudaSafeCall( cudaGetLastError() );
mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
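// Illustration only, not part of the original file: a minimal host-side sketch of calling
// histogram256_gpu. The scratch buffer must hold one 256-bin partial histogram per launched
// block, i.e. PARTIAL_HISTOGRAM256_COUNT * HISTOGRAM256_BIN_COUNT uints, and `d_hist` is
// assumed to be a device array of 256 ints that receives the merged result. The kernel reads
// the image as 32-bit words, which the library's row alignment for device images permits.
static void histogram256Example(PtrStepSzb src, int* d_hist)
{
uint* d_partial = 0;
cudaSafeCall( cudaMalloc(&d_partial, PARTIAL_HISTOGRAM256_COUNT * HISTOGRAM256_BIN_COUNT * sizeof(uint)) );
histogram256_gpu(src, d_hist, d_partial, 0); // stream 0: returns only after the result is ready
cudaSafeCall( cudaFree(d_partial) );
}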
__constant__ int c_lut[256];
__global__ void equalizeHist(const PtrStepSzb src, PtrStepb dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < src.cols && y < src.rows)
{
const uchar val = src.ptr(y)[x];
const int lut = c_lut[val];
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
}
}
void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
equalizeHist<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
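// Illustration only, not part of the original file: one way the lookup table consumed by
// equalizeHist_gpu could be prepared. The kernel above multiplies lut[val] by
// 255 / (cols * rows), so the table is simply the cumulative histogram; building it on the
// host and copying it to `d_lut` (a device array of 256 ints) is an assumption of this
// sketch, not how the library itself produces it.
static void buildEqualizeHistLutExample(const int h_hist[256], int* d_lut)
{
int cdf[256];
int running = 0;
for (int i = 0; i < 256; ++i)
{
running += h_hist[i]; // pixels with value <= i
cdf[i] = running;
}
cudaSafeCall( cudaMemcpy(d_lut, cdf, 256 * sizeof(int), cudaMemcpyHostToDevice) );
}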
} // namespace hist
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,100 +1,100 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_internal_shared_HPP__
#define __OPENCV_internal_shared_HPP__
#include <cuda_runtime.h>
#include <npp.h>
#include "NPP_staging.hpp"
#include "opencv2/gpu/devmem2d.hpp"
#include "safe_call.hpp"
#include "opencv2/gpu/device/common.hpp"
namespace cv { namespace gpu
{
enum
{
BORDER_REFLECT101_GPU = 0,
BORDER_REPLICATE_GPU,
BORDER_CONSTANT_GPU,
BORDER_REFLECT_GPU,
BORDER_WRAP_GPU
};
class NppStreamHandler
{
public:
inline explicit NppStreamHandler(cudaStream_t newStream = 0)
{
oldStream = nppGetStream();
nppSetStream(newStream);
}
inline ~NppStreamHandler()
{
nppSetStream(oldStream);
}
private:
cudaStream_t oldStream;
};
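// Illustration only, not part of the original file: the handler is meant as an RAII scope
// guard, so NPP primitives issued while it is alive run on the caller's stream and the
// previously active NPP stream is restored automatically.
inline void nppStreamScopeExample(cudaStream_t stream)
{
NppStreamHandler guard(stream);
// ... NPP-backed calls made here execute on `stream` ...
} // ~NppStreamHandler() restores the previous NPP stream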
class NppStStreamHandler
{
public:
inline explicit NppStStreamHandler(cudaStream_t newStream = 0)
{
oldStream = nppStSetActiveCUDAstream(newStream);
}
inline ~NppStStreamHandler()
{
nppStSetActiveCUDAstream(oldStream);
}
private:
cudaStream_t oldStream;
};
}}
#endif /* __OPENCV_internal_shared_HPP__ */

File diff suppressed because it is too large

View File

@@ -1,217 +1,217 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace device
{
namespace mathfunc
{
//////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar
struct Nothing
{
static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
{
}
};
struct Magnitude
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
}
};
struct MagnitudeSqr
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
{
dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
}
};
struct Atan2
{
static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
{
float angle = ::atan2f(y_data, x_data);
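// atan2f yields a value in (-pi, pi]; adding 2*pi to negative results maps the angle into [0, 2*pi) before scaling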
angle += (angle < 0) * 2.0 * CV_PI;
dst[y * dst_step + x] = scale * angle;
}
};
template <typename Mag, typename Angle>
__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float x_data = xptr[y * x_step + x];
float y_data = yptr[y * y_step + x];
Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
}
}
struct NonEmptyMag
{
static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
{
return mag[y * mag_step + x];
}
};
struct EmptyMag
{
static __device__ __forceinline__ float get(const float*, size_t, int, int)
{
return 1.0f;
}
};
template <typename Mag>
__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
float mag_data = Mag::get(mag, mag_step, x, y);
float angle_data = angle[y * angle_step + x];
float sin_a, cos_a;
::sincosf(scale * angle_data, &sin_a, &cos_a);
xptr[y * x_step + x] = mag_data * cos_a;
yptr[y * y_step + x] = mag_data * sin_a;
}
}
template <typename Mag, typename Angle>
void cartToPolar_caller(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(x.cols, threads.x);
grid.y = divUp(x.rows, threads.y);
const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2][2][2] =
{
{
{
cartToPolar_caller<Magnitude, Atan2>,
cartToPolar_caller<Magnitude, Nothing>
},
{
cartToPolar_caller<MagnitudeSqr, Atan2>,
cartToPolar_caller<MagnitudeSqr, Nothing>,
}
},
{
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>
},
{
cartToPolar_caller<Nothing, Atan2>,
cartToPolar_caller<Nothing, Nothing>,
}
}
};
callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
}
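// Note on the dispatch above: the table is indexed as callers[magnitude missing][squared][angle missing].
// For example, with a magnitude output requested, magSqr == false and an angle output requested, the call
// resolves to callers[0][0][0] == cartToPolar_caller<Magnitude, Atan2>; dropping the angle output flips the
// last index to 1 and substitutes the no-op Nothing functor for the angle computation.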
template <typename Mag>
void polarToCart_caller(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(mag.cols, threads.x);
grid.y = divUp(mag.rows, threads.y);
const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;
polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2] =
{
polarToCart_caller<NonEmptyMag>,
polarToCart_caller<EmptyMag>
};
callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
}
} // namespace mathfunc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */

File diff suppressed because it is too large

View File

@@ -74,12 +74,12 @@ namespace cv { namespace gpu { namespace device
const int i = blockDim.y * blockIdx.y + threadIdx.y;
const int j = blockDim.x * blockIdx.x + threadIdx.x;
if (j >= dst.cols || i >= dst.rows)
return;
int bsize = search_radius + block_radius;
int search_window = 2 * search_radius + 1;
float minus_search_window2_inv = -1.f/(search_window * search_window);
value_type sum1 = VecTraits<value_type>::all(0);
@@ -87,16 +87,16 @@ namespace cv { namespace gpu { namespace device
if (j - bsize >= 0 && j + bsize < dst.cols && i - bsize >= 0 && i + bsize < dst.rows)
{
for(float y = -search_radius; y <= search_radius; ++y)
for(float x = -search_radius; x <= search_radius; ++x)
{
float dist2 = 0;
for(float ty = -block_radius; ty <= block_radius; ++ty)
for(float tx = -block_radius; tx <= block_radius; ++tx)
{
value_type bv = saturate_cast<value_type>(src(i + y + ty, j + x + tx));
value_type av = saturate_cast<value_type>(src(i + ty, j + tx));
dist2 += norm2(av - bv);
}
@@ -119,10 +119,10 @@ namespace cv { namespace gpu { namespace device
for(float tx = -block_radius; tx <= block_radius; ++tx)
{
value_type bv = saturate_cast<value_type>(b.at(i + y + ty, j + x + tx, src));
value_type av = saturate_cast<value_type>(b.at(i + ty, j + tx, src));
dist2 += norm2(av - bv);
}
float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv);
sum1 = sum1 + w * saturate_cast<value_type>(b.at(i + y, j + x, src));
@@ -144,9 +144,9 @@ namespace cv { namespace gpu { namespace device
B<T> b(src.rows, src.cols);
int block_window = 2 * block_radius + 1;
float minus_h2_inv = -1.f/(h * h * VecTraits<T>::cn);
float noise_mult = minus_h2_inv/(block_window * block_window);
cudaSafeCall( cudaFuncSetCacheConfig (nlm_kernel<T, B<T> >, cudaFuncCachePreferL1) );
nlm_kernel<<<grid, block>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, search_radius, block_radius, noise_mult);
cudaSafeCall ( cudaGetLastError () );
@@ -160,7 +160,7 @@ namespace cv { namespace gpu { namespace device
{
typedef void (*func_t)(const PtrStepSzb src, PtrStepSzb dst, int search_radius, int block_radius, float h, cudaStream_t stream);
static func_t funcs[] =
{
nlm_caller<T, BrdReflect101>,
nlm_caller<T, BrdReplicate>,
@@ -183,7 +183,7 @@ namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
__device__ __forceinline__ int calcDist(const uchar& a, const uchar& b) { return (a-b)*(a-b); }
__device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); }
__device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); }
@@ -193,7 +193,7 @@ namespace cv { namespace gpu { namespace device
enum
{
CTA_SIZE = 128,
TILE_COLS = 128,
TILE_ROWS = 32,
@@ -217,7 +217,7 @@ namespace cv { namespace gpu { namespace device
PtrStep<T> src;
mutable PtrStepi buffer;
__device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
@@ -256,7 +256,7 @@ namespace cv { namespace gpu { namespace device
int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx));
dist_sums[index] += dist;
col_sums(tx + block_radius, index) += dist;
}
#endif
@@ -265,9 +265,9 @@ namespace cv { namespace gpu { namespace device
}
__device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
@@ -280,8 +280,8 @@ namespace cv { namespace gpu { namespace device
int col_sum = 0;
for (int ty = -block_radius; ty <= block_radius; ++ty)
col_sum += calcDist(src(ay + ty, ax), src(by + ty, bx));
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
@@ -298,7 +298,7 @@ namespace cv { namespace gpu { namespace device
T a_down = src(ay + block_radius, ax);
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
@@ -309,9 +309,9 @@ namespace cv { namespace gpu { namespace device
T b_down = src(by + block_radius, bx);
int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up);
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
up_col_sums(j, index) = col_sum;
}
}
@@ -339,14 +339,14 @@ namespace cv { namespace gpu { namespace device
sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x));
}
volatile __shared__ float cta_buffer[CTA_SIZE];
int tid = threadIdx.x;
cta_buffer[tid] = weights_sum;
__syncthreads();
Block::reduce<CTA_SIZE>(cta_buffer, plus());
weights_sum = cta_buffer[0];
__syncthreads();
@@ -356,7 +356,7 @@ namespace cv { namespace gpu { namespace device
{
cta_buffer[tid] = reinterpret_cast<float*>(&sum)[n];
__syncthreads();
Block::reduce<CTA_SIZE>(cta_buffer, plus());
reinterpret_cast<float*>(&sum)[n] = cta_buffer[0];
__syncthreads();
@@ -388,7 +388,7 @@ namespace cv { namespace gpu { namespace device
for (int i = tby; i < tey; ++i)
for (int j = tbx; j < tex; ++j)
{
__syncthreads();
if (j == tbx)
@@ -397,7 +397,7 @@ namespace cv { namespace gpu { namespace device
first = 0;
}
else
{
if (i == tby)
shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums);
else
@@ -407,7 +407,7 @@ namespace cv { namespace gpu { namespace device
}
__syncthreads();
convolve_window(i, j, dist_sums, col_sums, up_col_sums, dst(i, j));
}
}
@@ -418,7 +418,7 @@ namespace cv { namespace gpu { namespace device
__global__ void fast_nlm_kernel(const FastNonLocalMenas<T> fnlm, PtrStepSz<T> dst) { fnlm(dst); }
void nln_fast_get_buffer_size(const PtrStepSzb& src, int search_window, int block_window, int& buffer_cols, int& buffer_rows)
{
typedef FastNonLocalMenas<uchar> FNLM;
dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS));
@@ -434,13 +434,13 @@ namespace cv { namespace gpu { namespace device
FNLM fnlm(search_window, block_window, h);
fnlm.src = (PtrStepSz<T>)src;
fnlm.buffer = buffer;
dim3 block(FNLM::CTA_SIZE, 1);
dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS));
int smem = search_window * search_window * sizeof(int);
fast_nlm_kernel<<<grid, block, smem>>>(fnlm, (PtrStepSz<T>)dst);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
@@ -451,7 +451,7 @@ namespace cv { namespace gpu { namespace device
template void nlm_fast_gpu<uchar2>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
template void nlm_fast_gpu<uchar3>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
__global__ void fnlm_split_kernel(const PtrStepSz<uchar3> lab, PtrStepb l, PtrStep<uchar2> ab)
{
@@ -462,10 +462,10 @@ namespace cv { namespace gpu { namespace device
{
uchar3 p = lab(y, x);
ab(y,x) = make_uchar2(p.y, p.z);
l(y,x) = p.x;
}
}
void fnlm_split_channels(const PtrStepSz<uchar3>& lab, PtrStepb l, PtrStep<uchar2> ab, cudaStream_t stream)
{
dim3 b(32, 8);
@@ -485,7 +485,7 @@ namespace cv { namespace gpu { namespace device
if (x < lab.cols && y < lab.rows)
{
uchar2 p = ab(y, x);
lab(y, x) = make_uchar3(l(y, x), p.x, p.y);
}
}

View File

@@ -1,220 +1,220 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
namespace cv { namespace gpu { namespace device
{
namespace optical_flow
{
#define NEEDLE_MAP_SCALE 16
#define NUM_VERTS_PER_ARROW 6
__global__ void NeedleMapAverageKernel(const PtrStepSzf u, const PtrStepf v, PtrStepf u_avg, PtrStepf v_avg)
{
__shared__ float smem[2 * NEEDLE_MAP_SCALE];
volatile float* u_col_sum = smem;
volatile float* v_col_sum = u_col_sum + NEEDLE_MAP_SCALE;
const int x = blockIdx.x * NEEDLE_MAP_SCALE + threadIdx.x;
const int y = blockIdx.y * NEEDLE_MAP_SCALE;
u_col_sum[threadIdx.x] = 0;
v_col_sum[threadIdx.x] = 0;
#pragma unroll
for(int i = 0; i < NEEDLE_MAP_SCALE; ++i)
{
u_col_sum[threadIdx.x] += u(::min(y + i, u.rows - 1), x);
v_col_sum[threadIdx.x] += v(::min(y + i, u.rows - 1), x);
}
if (threadIdx.x < 8)
{
// now add the column sums
const uint X = threadIdx.x;
if ((X & 0x01) == 0) // bit 0 is 0
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 1];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 1];
}
if ((X & 0x03) == 0) // bits 0 & 1 == 0
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 2];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 2];
}
if ((X & 0x07) == 0)
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 4];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 4];
}
if (X == 0)
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 8];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 8];
}
}
if (threadIdx.x == 0)
{
const float coeff = 1.0f / (NEEDLE_MAP_SCALE * NEEDLE_MAP_SCALE);
u_col_sum[0] *= coeff;
v_col_sum[0] *= coeff;
u_avg(blockIdx.y, blockIdx.x) = u_col_sum[0];
v_avg(blockIdx.y, blockIdx.x) = v_col_sum[0];
}
}
void NeedleMapAverage_gpu(PtrStepSzf u, PtrStepSzf v, PtrStepSzf u_avg, PtrStepSzf v_avg)
{
const dim3 block(NEEDLE_MAP_SCALE);
const dim3 grid(u_avg.cols, u_avg.rows);
NeedleMapAverageKernel<<<grid, block>>>(u, v, u_avg, v_avg);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void NeedleMapVertexKernel(const PtrStepSzf u_avg, const PtrStepf v_avg, float* vertex_data, float* color_data, float max_flow, float xscale, float yscale)
{
// test - just draw a triangle at each pixel
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const float arrow_x = x * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
const float arrow_y = y * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
float3 v[NUM_VERTS_PER_ARROW];
if (x < u_avg.cols && y < u_avg.rows)
{
const float u_avg_val = u_avg(y, x);
const float v_avg_val = v_avg(y, x);
const float theta = ::atan2f(v_avg_val, u_avg_val);// + CV_PI;
float r = ::sqrtf(v_avg_val * v_avg_val + u_avg_val * u_avg_val);
r = fmin(14.0f * (r / max_flow), 14.0f);
v[0].z = 1.0f;
v[1].z = 0.7f;
v[2].z = 0.7f;
v[3].z = 0.7f;
v[4].z = 0.7f;
v[5].z = 1.0f;
v[0].x = arrow_x;
v[0].y = arrow_y;
v[5].x = arrow_x;
v[5].y = arrow_y;
v[2].x = arrow_x + r * ::cosf(theta);
v[2].y = arrow_y + r * ::sinf(theta);
v[3].x = v[2].x;
v[3].y = v[2].y;
r = ::fmin(r, 2.5f);
v[1].x = arrow_x + r * ::cosf(theta - CV_PI / 2.0f);
v[1].y = arrow_y + r * ::sinf(theta - CV_PI / 2.0f);
v[4].x = arrow_x + r * ::cosf(theta + CV_PI / 2.0f);
v[4].y = arrow_y + r * ::sinf(theta + CV_PI / 2.0f);
int indx = (y * u_avg.cols + x) * NUM_VERTS_PER_ARROW * 3;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[0].x * xscale;
vertex_data[indx++] = v[0].y * yscale;
vertex_data[indx++] = v[0].z;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[1].x * xscale;
vertex_data[indx++] = v[1].y * yscale;
vertex_data[indx++] = v[1].z;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[2].x * xscale;
vertex_data[indx++] = v[2].y * yscale;
vertex_data[indx++] = v[2].z;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[3].x * xscale;
vertex_data[indx++] = v[3].y * yscale;
vertex_data[indx++] = v[3].z;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[4].x * xscale;
vertex_data[indx++] = v[4].y * yscale;
vertex_data[indx++] = v[4].z;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[5].x * xscale;
vertex_data[indx++] = v[5].y * yscale;
vertex_data[indx++] = v[5].z;
}
}
void CreateOpticalFlowNeedleMap_gpu(PtrStepSzf u_avg, PtrStepSzf v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale)
{
const dim3 block(16);
const dim3 grid(divUp(u_avg.cols, block.x), divUp(u_avg.rows, block.y));
NeedleMapVertexKernel<<<grid, block>>>(u_avg, v_avg, vertex_buffer, color_data, max_flow, xscale, yscale);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
namespace cv { namespace gpu { namespace device
{
namespace optical_flow
{
#define NEEDLE_MAP_SCALE 16
#define NUM_VERTS_PER_ARROW 6
__global__ void NeedleMapAverageKernel(const PtrStepSzf u, const PtrStepf v, PtrStepf u_avg, PtrStepf v_avg)
{
__shared__ float smem[2 * NEEDLE_MAP_SCALE];
volatile float* u_col_sum = smem;
volatile float* v_col_sum = u_col_sum + NEEDLE_MAP_SCALE;
const int x = blockIdx.x * NEEDLE_MAP_SCALE + threadIdx.x;
const int y = blockIdx.y * NEEDLE_MAP_SCALE;
u_col_sum[threadIdx.x] = 0;
v_col_sum[threadIdx.x] = 0;
#pragma unroll
for(int i = 0; i < NEEDLE_MAP_SCALE; ++i)
{
u_col_sum[threadIdx.x] += u(::min(y + i, u.rows - 1), x);
v_col_sum[threadIdx.x] += v(::min(y + i, u.rows - 1), x);
}
if (threadIdx.x < 8)
{
// now add the column sums: tree reduction over the 16 partial sums
// (the block is a single warp and smem is volatile, so no __syncthreads is needed)
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 8];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 8];
}
if (threadIdx.x < 4)
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 4];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 4];
}
if (threadIdx.x < 2)
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 2];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 2];
}
if (threadIdx.x < 1)
{
u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 1];
v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 1];
}
if (threadIdx.x == 0)
{
const float coeff = 1.0f / (NEEDLE_MAP_SCALE * NEEDLE_MAP_SCALE);
u_col_sum[0] *= coeff;
v_col_sum[0] *= coeff;
u_avg(blockIdx.y, blockIdx.x) = u_col_sum[0];
v_avg(blockIdx.y, blockIdx.x) = v_col_sum[0];
}
}
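// In other words, each block reduces one NEEDLE_MAP_SCALE x NEEDLE_MAP_SCALE tile of the
// flow field to its mean vector. A plain host-side equivalent, assuming row-major storage
// with a given stride and ignoring the border clamping done above (illustrative helper name):
inline void tileAverage16(const float* u, const float* v, int stride, float& u_avg_out, float& v_avg_out)
{
    float su = 0.0f, sv = 0.0f;
    for (int i = 0; i < 16; ++i)
        for (int j = 0; j < 16; ++j)
        {
            su += u[i * stride + j];
            sv += v[i * stride + j];
        }
    u_avg_out = su * (1.0f / 256.0f); // same coeff = 1 / (NEEDLE_MAP_SCALE * NEEDLE_MAP_SCALE)
    v_avg_out = sv * (1.0f / 256.0f);
}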
void NeedleMapAverage_gpu(PtrStepSzf u, PtrStepSzf v, PtrStepSzf u_avg, PtrStepSzf v_avg)
{
const dim3 block(NEEDLE_MAP_SCALE);
const dim3 grid(u_avg.cols, u_avg.rows);
NeedleMapAverageKernel<<<grid, block>>>(u, v, u_avg, v_avg);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void NeedleMapVertexKernel(const PtrStepSzf u_avg, const PtrStepf v_avg, float* vertex_data, float* color_data, float max_flow, float xscale, float yscale)
{
// test - just draw a triangle at each pixel
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
const float arrow_x = x * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
const float arrow_y = y * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
float3 v[NUM_VERTS_PER_ARROW];
if (x < u_avg.cols && y < u_avg.rows)
{
const float u_avg_val = u_avg(y, x);
const float v_avg_val = v_avg(y, x);
const float theta = ::atan2f(v_avg_val, u_avg_val);// + CV_PI;
float r = ::sqrtf(v_avg_val * v_avg_val + u_avg_val * u_avg_val);
r = fmin(14.0f * (r / max_flow), 14.0f);
v[0].z = 1.0f;
v[1].z = 0.7f;
v[2].z = 0.7f;
v[3].z = 0.7f;
v[4].z = 0.7f;
v[5].z = 1.0f;
v[0].x = arrow_x;
v[0].y = arrow_y;
v[5].x = arrow_x;
v[5].y = arrow_y;
v[2].x = arrow_x + r * ::cosf(theta);
v[2].y = arrow_y + r * ::sinf(theta);
v[3].x = v[2].x;
v[3].y = v[2].y;
r = ::fmin(r, 2.5f);
v[1].x = arrow_x + r * ::cosf(theta - CV_PI / 2.0f);
v[1].y = arrow_y + r * ::sinf(theta - CV_PI / 2.0f);
v[4].x = arrow_x + r * ::cosf(theta + CV_PI / 2.0f);
v[4].y = arrow_y + r * ::sinf(theta + CV_PI / 2.0f);
int indx = (y * u_avg.cols + x) * NUM_VERTS_PER_ARROW * 3;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[0].x * xscale;
vertex_data[indx++] = v[0].y * yscale;
vertex_data[indx++] = v[0].z;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[1].x * xscale;
vertex_data[indx++] = v[1].y * yscale;
vertex_data[indx++] = v[1].z;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[2].x * xscale;
vertex_data[indx++] = v[2].y * yscale;
vertex_data[indx++] = v[2].z;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[3].x * xscale;
vertex_data[indx++] = v[3].y * yscale;
vertex_data[indx++] = v[3].z;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[4].x * xscale;
vertex_data[indx++] = v[4].y * yscale;
vertex_data[indx++] = v[4].z;
color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
vertex_data[indx++] = v[5].x * xscale;
vertex_data[indx++] = v[5].y * yscale;
vertex_data[indx++] = v[5].z;
}
}
void CreateOpticalFlowNeedleMap_gpu(PtrStepSzf u_avg, PtrStepSzf v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale)
{
const dim3 block(16);
const dim3 grid(divUp(u_avg.cols, block.x), divUp(u_avg.rows, block.y));
NeedleMapVertexKernel<<<grid, block>>>(u_avg, v_avg, vertex_buffer, color_data, max_flow, xscale, yscale);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif /* CUDA_DISABLER */
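// Sizing note: the vertex_buffer and color_data arrays passed to CreateOpticalFlowNeedleMap_gpu
// above must hold six vertices of three floats per averaged flow cell. A small sketch, assuming
// u_avg/v_avg have ceil(cols / 16) x ceil(rows / 16) cells as in the tiling above (helper name illustrative):
#include <cstddef>

inline std::size_t needleMapBufferFloats(int flow_cols, int flow_rows)
{
    const int scale = 16;                                  // NEEDLE_MAP_SCALE
    const int avg_cols = (flow_cols + scale - 1) / scale;  // same rounding as divUp
    const int avg_rows = (flow_rows + scale - 1) / scale;
    return static_cast<std::size_t>(avg_cols) * avg_rows * 6 * 3; // 6 vertices, x/y/z each
}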

View File

@@ -1,422 +1,422 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
//
// The original code was written by Paul Furgale and Chi Hay Tong
// and later optimized and prepared for integration into OpenCV by Itseez.
//
//M*/
#if !defined CUDA_DISABLER
#include <thrust/sort.h>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/functional.hpp"
namespace cv { namespace gpu { namespace device
{
namespace orb
{
////////////////////////////////////////////////////////////////////////////////////////////////////////
// cull
int cull_gpu(int* loc, float* response, int size, int n_points)
{
thrust::device_ptr<int> loc_ptr(loc);
thrust::device_ptr<float> response_ptr(response);
thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
return n_points;
}
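// cull_gpu above simply key-sorts the keypoint indices by response in descending order and
// reports how many to keep. A host-side equivalent for comparison, assuming parallel arrays
// of equal length; this is an illustrative sketch, not an OpenCV API.
#include <algorithm>
#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

inline int cullCpu(std::vector<int>& loc, std::vector<float>& response, int n_points)
{
    std::vector< std::pair<float, int> > keyed(loc.size());
    for (std::size_t i = 0; i < loc.size(); ++i)
        keyed[i] = std::make_pair(response[i], loc[i]);
    // strongest responses first, mirroring thrust::greater<float>()
    std::sort(keyed.begin(), keyed.end(), std::greater< std::pair<float, int> >());
    for (std::size_t i = 0; i < keyed.size(); ++i)
    {
        response[i] = keyed[i].first;
        loc[i] = keyed[i].second;
    }
    return n_points;
}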
////////////////////////////////////////////////////////////////////////////////////////////////////////
// HarrisResponses
__global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
{
__shared__ int smem[8 * 32];
volatile int* srow = smem + threadIdx.y * blockDim.x;
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
if (ptidx < npoints)
{
const short2 loc = loc_[ptidx];
const int r = blockSize / 2;
const int x0 = loc.x - r;
const int y0 = loc.y - r;
int a = 0, b = 0, c = 0;
for (int ind = threadIdx.x; ind < blockSize * blockSize; ind += blockDim.x)
{
const int i = ind / blockSize;
const int j = ind % blockSize;
int Ix = (img(y0 + i, x0 + j + 1) - img(y0 + i, x0 + j - 1)) * 2 +
(img(y0 + i - 1, x0 + j + 1) - img(y0 + i - 1, x0 + j - 1)) +
(img(y0 + i + 1, x0 + j + 1) - img(y0 + i + 1, x0 + j - 1));
int Iy = (img(y0 + i + 1, x0 + j) - img(y0 + i - 1, x0 + j)) * 2 +
(img(y0 + i + 1, x0 + j - 1) - img(y0 + i - 1, x0 + j - 1)) +
(img(y0 + i + 1, x0 + j + 1) - img(y0 + i - 1, x0 + j + 1));
a += Ix * Ix;
b += Iy * Iy;
c += Ix * Iy;
}
reduce<32>(srow, a, threadIdx.x, plus<volatile int>());
reduce<32>(srow, b, threadIdx.x, plus<volatile int>());
reduce<32>(srow, c, threadIdx.x, plus<volatile int>());
if (threadIdx.x == 0)
{
float scale = (1 << 2) * blockSize * 255.0f;
scale = 1.0f / scale;
const float scale_sq_sq = scale * scale * scale * scale;
response[ptidx] = ((float)a * b - (float)c * c - harris_k * ((float)a + b) * ((float)a + b)) * scale_sq_sq;
}
}
}
void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(npoints, block.y);
HarrisResponses<<<grid, block, 0, stream>>>(img, loc, response, npoints, blockSize, harris_k);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
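// The score reduced above is the standard Harris measure on the patch gradient sums
// a = sum(Ix*Ix), b = sum(Iy*Iy), c = sum(Ix*Iy), normalized by (4 * blockSize * 255)^-4.
// A scalar sketch of the expression the lane-0 thread writes to response[ptidx]:
inline float harrisScore(float a, float b, float c, int blockSize, float harris_k)
{
    float scale = 1.0f / ((1 << 2) * blockSize * 255.0f);
    float scale_sq_sq = scale * scale * scale * scale;
    return (a * b - c * c - harris_k * (a + b) * (a + b)) * scale_sq_sq;
}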
////////////////////////////////////////////////////////////////////////////////////////////////////////
// IC_Angle
__constant__ int c_u_max[32];
void loadUMax(const int* u_max, int count)
{
cudaSafeCall( cudaMemcpyToSymbol(c_u_max, u_max, count * sizeof(int)) );
}
__global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
{
__shared__ int smem[8 * 32];
volatile int* srow = smem + threadIdx.y * blockDim.x;
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
if (ptidx < npoints)
{
int m_01 = 0, m_10 = 0;
const short2 loc = loc_[ptidx];
// Treat the center line differently, v=0
for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
m_10 += u * image(loc.y, loc.x + u);
reduce<32>(srow, m_10, threadIdx.x, plus<volatile int>());
for (int v = 1; v <= half_k; ++v)
{
// Proceed over the two lines
int v_sum = 0;
int m_sum = 0;
const int d = c_u_max[v];
for (int u = threadIdx.x - d; u <= d; u += blockDim.x)
{
int val_plus = image(loc.y + v, loc.x + u);
int val_minus = image(loc.y - v, loc.x + u);
v_sum += (val_plus - val_minus);
m_sum += u * (val_plus + val_minus);
}
reduce<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
reduce<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
m_10 += m_sum;
m_01 += v * v_sum;
}
if (threadIdx.x == 0)
{
float kp_dir = ::atan2f((float)m_01, (float)m_10);
kp_dir += (kp_dir < 0) * (2.0f * CV_PI);
kp_dir *= 180.0f / CV_PI;
angle[ptidx] = kp_dir;
}
}
}
void IC_Angle_gpu(PtrStepSzb image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(npoints, block.y);
IC_Angle<<<grid, block, 0, stream>>>(image, loc, angle, npoints, half_k);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
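// The orientation assigned above is the intensity-centroid angle: atan2 of the patch
// moments m_01 and m_10, wrapped into [0, 360) degrees. A scalar sketch of that final step:
#include <cmath>

inline float icAngleDegrees(float m_01, float m_10)
{
    float kp_dir = std::atan2(m_01, m_10);
    if (kp_dir < 0.0f)
        kp_dir += 2.0f * 3.14159265f;       // wrap into [0, 2*pi)
    return kp_dir * (180.0f / 3.14159265f); // degrees, as stored in angle[ptidx]
}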
////////////////////////////////////////////////////////////////////////////////////////////////////////
// computeOrbDescriptor
template <int WTA_K> struct OrbDescriptor;
#define GET_VALUE(idx) \
img(loc.y + __float2int_rn(pattern_x[idx] * sina + pattern_y[idx] * cosa), \
loc.x + __float2int_rn(pattern_x[idx] * cosa - pattern_y[idx] * sina))
template <> struct OrbDescriptor<2>
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 16 * i;
pattern_y += 16 * i;
int t0, t1, val;
t0 = GET_VALUE(0); t1 = GET_VALUE(1);
val = t0 < t1;
t0 = GET_VALUE(2); t1 = GET_VALUE(3);
val |= (t0 < t1) << 1;
t0 = GET_VALUE(4); t1 = GET_VALUE(5);
val |= (t0 < t1) << 2;
t0 = GET_VALUE(6); t1 = GET_VALUE(7);
val |= (t0 < t1) << 3;
t0 = GET_VALUE(8); t1 = GET_VALUE(9);
val |= (t0 < t1) << 4;
t0 = GET_VALUE(10); t1 = GET_VALUE(11);
val |= (t0 < t1) << 5;
t0 = GET_VALUE(12); t1 = GET_VALUE(13);
val |= (t0 < t1) << 6;
t0 = GET_VALUE(14); t1 = GET_VALUE(15);
val |= (t0 < t1) << 7;
return val;
}
};
template <> struct OrbDescriptor<3>
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 12 * i;
pattern_y += 12 * i;
int t0, t1, t2, val;
t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);
val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);
t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;
t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;
t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;
return val;
}
};
template <> struct OrbDescriptor<4>
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 16 * i;
pattern_y += 16 * i;
int t0, t1, t2, t3, k, val;
int a, b;
t0 = GET_VALUE(0); t1 = GET_VALUE(1);
t2 = GET_VALUE(2); t3 = GET_VALUE(3);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val = k;
t0 = GET_VALUE(4); t1 = GET_VALUE(5);
t2 = GET_VALUE(6); t3 = GET_VALUE(7);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 2;
t0 = GET_VALUE(8); t1 = GET_VALUE(9);
t2 = GET_VALUE(10); t3 = GET_VALUE(11);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 4;
t0 = GET_VALUE(12); t1 = GET_VALUE(13);
t2 = GET_VALUE(14); t3 = GET_VALUE(15);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 6;
return val;
}
};
#undef GET_VALUE
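// Each specialization above packs one descriptor byte from rotated pattern samples. For
// WTA_K == 2 the byte is eight independent "t0 < t1" tests, one per bit; a compact sketch
// of that packing once the sixteen samples have been fetched (helper name illustrative):
inline int packByteWta2(const int t[16])
{
    int val = 0;
    for (int k = 0; k < 8; ++k)
        val |= (t[2 * k] < t[2 * k + 1]) ? (1 << k) : 0;
    return val;
}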
template <int WTA_K>
__global__ void computeOrbDescriptor(const PtrStepb img, const short2* loc, const float* angle_, const int npoints,
const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize)
{
const int descidx = blockIdx.x * blockDim.x + threadIdx.x;
const int ptidx = blockIdx.y * blockDim.y + threadIdx.y;
if (ptidx < npoints && descidx < dsize)
{
float angle = angle_[ptidx];
angle *= (float)(CV_PI / 180.f);
float sina, cosa;
::sincosf(angle, &sina, &cosa);
desc.ptr(ptidx)[descidx] = OrbDescriptor<WTA_K>::calc(img, loc[ptidx], pattern_x, pattern_y, sina, cosa, descidx);
}
}
void computeOrbDescriptor_gpu(PtrStepb img, const short2* loc, const float* angle, const int npoints,
const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize, int WTA_K, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(dsize, block.x);
grid.y = divUp(npoints, block.y);
switch (WTA_K)
{
case 2:
computeOrbDescriptor<2><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
break;
case 3:
computeOrbDescriptor<3><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
break;
case 4:
computeOrbDescriptor<4><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
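// Before any sample is fetched, GET_VALUE rotates the (pattern_x, pattern_y) offset by the
// keypoint angle (converted to radians and split into sina/cosa by sincosf above). A host
// sketch of that rotation, with std::lrintf standing in for __float2int_rn:
#include <cmath>

inline void rotatePatternPoint(int px, int py, float sina, float cosa, int& dx, int& dy)
{
    dx = static_cast<int>(std::lrintf(px * cosa - py * sina)); // column offset added to loc.x
    dy = static_cast<int>(std::lrintf(px * sina + py * cosa)); // row offset added to loc.y
}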
////////////////////////////////////////////////////////////////////////////////////////////////////////
// mergeLocation
__global__ void mergeLocation(const short2* loc_, float* x, float* y, const int npoints, float scale)
{
const int ptidx = blockIdx.x * blockDim.x + threadIdx.x;
if (ptidx < npoints)
{
short2 loc = loc_[ptidx];
x[ptidx] = loc.x * scale;
y[ptidx] = loc.y * scale;
}
}
void mergeLocation_gpu(const short2* loc, float* x, float* y, int npoints, float scale, cudaStream_t stream)
{
dim3 block(256);
dim3 grid;
grid.x = divUp(npoints, block.x);
mergeLocation<<<grid, block, 0, stream>>>(loc, x, y, npoints, scale);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
// Copyright (c) 2010, Paul Furgale, Chi Hay Tong
//
// The original code was written by Paul Furgale and Chi Hay Tong
// and later optimized and prepared for integration into OpenCV by Itseez.
//
//M*/
#if !defined CUDA_DISABLER
#include <thrust/sort.h>
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/functional.hpp"
namespace cv { namespace gpu { namespace device
{
namespace orb
{
////////////////////////////////////////////////////////////////////////////////////////////////////////
// cull
int cull_gpu(int* loc, float* response, int size, int n_points)
{
thrust::device_ptr<int> loc_ptr(loc);
thrust::device_ptr<float> response_ptr(response);
thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
return n_points;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// HarrisResponses
__global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
{
__shared__ int smem[8 * 32];
volatile int* srow = smem + threadIdx.y * blockDim.x;
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
if (ptidx < npoints)
{
const short2 loc = loc_[ptidx];
const int r = blockSize / 2;
const int x0 = loc.x - r;
const int y0 = loc.y - r;
int a = 0, b = 0, c = 0;
for (int ind = threadIdx.x; ind < blockSize * blockSize; ind += blockDim.x)
{
const int i = ind / blockSize;
const int j = ind % blockSize;
int Ix = (img(y0 + i, x0 + j + 1) - img(y0 + i, x0 + j - 1)) * 2 +
(img(y0 + i - 1, x0 + j + 1) - img(y0 + i - 1, x0 + j - 1)) +
(img(y0 + i + 1, x0 + j + 1) - img(y0 + i + 1, x0 + j - 1));
int Iy = (img(y0 + i + 1, x0 + j) - img(y0 + i - 1, x0 + j)) * 2 +
(img(y0 + i + 1, x0 + j - 1) - img(y0 + i - 1, x0 + j - 1)) +
(img(y0 + i + 1, x0 + j + 1) - img(y0 + i - 1, x0 + j + 1));
a += Ix * Ix;
b += Iy * Iy;
c += Ix * Iy;
}
reduce<32>(srow, a, threadIdx.x, plus<volatile int>());
reduce<32>(srow, b, threadIdx.x, plus<volatile int>());
reduce<32>(srow, c, threadIdx.x, plus<volatile int>());
if (threadIdx.x == 0)
{
float scale = (1 << 2) * blockSize * 255.0f;
scale = 1.0f / scale;
const float scale_sq_sq = scale * scale * scale * scale;
response[ptidx] = ((float)a * b - (float)c * c - harris_k * ((float)a + b) * ((float)a + b)) * scale_sq_sq;
}
}
}
void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(npoints, block.y);
HarrisResponses<<<grid, block, 0, stream>>>(img, loc, response, npoints, blockSize, harris_k);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// IC_Angle
__constant__ int c_u_max[32];
void loadUMax(const int* u_max, int count)
{
cudaSafeCall( cudaMemcpyToSymbol(c_u_max, u_max, count * sizeof(int)) );
}
__global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
{
__shared__ int smem[8 * 32];
volatile int* srow = smem + threadIdx.y * blockDim.x;
const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
if (ptidx < npoints)
{
int m_01 = 0, m_10 = 0;
const short2 loc = loc_[ptidx];
// Treat the center line differently, v=0
for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
m_10 += u * image(loc.y, loc.x + u);
reduce<32>(srow, m_10, threadIdx.x, plus<volatile int>());
for (int v = 1; v <= half_k; ++v)
{
// Proceed over the two lines
int v_sum = 0;
int m_sum = 0;
const int d = c_u_max[v];
for (int u = threadIdx.x - d; u <= d; u += blockDim.x)
{
int val_plus = image(loc.y + v, loc.x + u);
int val_minus = image(loc.y - v, loc.x + u);
v_sum += (val_plus - val_minus);
m_sum += u * (val_plus + val_minus);
}
reduce<32>(srow, v_sum, threadIdx.x, plus<volatile int>());
reduce<32>(srow, m_sum, threadIdx.x, plus<volatile int>());
m_10 += m_sum;
m_01 += v * v_sum;
}
if (threadIdx.x == 0)
{
float kp_dir = ::atan2f((float)m_01, (float)m_10);
kp_dir += (kp_dir < 0) * (2.0f * CV_PI);
kp_dir *= 180.0f / CV_PI;
angle[ptidx] = kp_dir;
}
}
}
void IC_Angle_gpu(PtrStepSzb image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(npoints, block.y);
IC_Angle<<<grid, block, 0, stream>>>(image, loc, angle, npoints, half_k);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// computeOrbDescriptor
template <int WTA_K> struct OrbDescriptor;
#define GET_VALUE(idx) \
img(loc.y + __float2int_rn(pattern_x[idx] * sina + pattern_y[idx] * cosa), \
loc.x + __float2int_rn(pattern_x[idx] * cosa - pattern_y[idx] * sina))
template <> struct OrbDescriptor<2>
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 16 * i;
pattern_y += 16 * i;
int t0, t1, val;
t0 = GET_VALUE(0); t1 = GET_VALUE(1);
val = t0 < t1;
t0 = GET_VALUE(2); t1 = GET_VALUE(3);
val |= (t0 < t1) << 1;
t0 = GET_VALUE(4); t1 = GET_VALUE(5);
val |= (t0 < t1) << 2;
t0 = GET_VALUE(6); t1 = GET_VALUE(7);
val |= (t0 < t1) << 3;
t0 = GET_VALUE(8); t1 = GET_VALUE(9);
val |= (t0 < t1) << 4;
t0 = GET_VALUE(10); t1 = GET_VALUE(11);
val |= (t0 < t1) << 5;
t0 = GET_VALUE(12); t1 = GET_VALUE(13);
val |= (t0 < t1) << 6;
t0 = GET_VALUE(14); t1 = GET_VALUE(15);
val |= (t0 < t1) << 7;
return val;
}
};
template <> struct OrbDescriptor<3>
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 12 * i;
pattern_y += 12 * i;
int t0, t1, t2, val;
t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);
val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);
t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;
t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;
t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);
val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;
return val;
}
};
template <> struct OrbDescriptor<4>
{
__device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
{
pattern_x += 16 * i;
pattern_y += 16 * i;
int t0, t1, t2, t3, k, val;
int a, b;
t0 = GET_VALUE(0); t1 = GET_VALUE(1);
t2 = GET_VALUE(2); t3 = GET_VALUE(3);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val = k;
t0 = GET_VALUE(4); t1 = GET_VALUE(5);
t2 = GET_VALUE(6); t3 = GET_VALUE(7);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 2;
t0 = GET_VALUE(8); t1 = GET_VALUE(9);
t2 = GET_VALUE(10); t3 = GET_VALUE(11);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 4;
t0 = GET_VALUE(12); t1 = GET_VALUE(13);
t2 = GET_VALUE(14); t3 = GET_VALUE(15);
a = 0, b = 2;
if( t1 > t0 ) t0 = t1, a = 1;
if( t3 > t2 ) t2 = t3, b = 3;
k = t0 > t2 ? a : b;
val |= k << 6;
return val;
}
};
#undef GET_VALUE
template <int WTA_K>
__global__ void computeOrbDescriptor(const PtrStepb img, const short2* loc, const float* angle_, const int npoints,
const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize)
{
const int descidx = blockIdx.x * blockDim.x + threadIdx.x;
const int ptidx = blockIdx.y * blockDim.y + threadIdx.y;
if (ptidx < npoints && descidx < dsize)
{
float angle = angle_[ptidx];
angle *= (float)(CV_PI / 180.f);
float sina, cosa;
::sincosf(angle, &sina, &cosa);
desc.ptr(ptidx)[descidx] = OrbDescriptor<WTA_K>::calc(img, loc[ptidx], pattern_x, pattern_y, sina, cosa, descidx);
}
}
void computeOrbDescriptor_gpu(PtrStepb img, const short2* loc, const float* angle, const int npoints,
const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize, int WTA_K, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid;
grid.x = divUp(dsize, block.x);
grid.y = divUp(npoints, block.y);
switch (WTA_K)
{
case 2:
computeOrbDescriptor<2><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
break;
case 3:
computeOrbDescriptor<3><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
break;
case 4:
computeOrbDescriptor<4><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////////////////////////////////////
// mergeLocation
__global__ void mergeLocation(const short2* loc_, float* x, float* y, const int npoints, float scale)
{
const int ptidx = blockIdx.x * blockDim.x + threadIdx.x;
if (ptidx < npoints)
{
short2 loc = loc_[ptidx];
x[ptidx] = loc.x * scale;
y[ptidx] = loc.y * scale;
}
}
void mergeLocation_gpu(const short2* loc, float* x, float* y, int npoints, float scale, cudaStream_t stream)
{
dim3 block(256);
dim3 grid;
grid.x = divUp(npoints, block.x);
mergeLocation<<<grid, block, 0, stream>>>(loc, x, y, npoints, scale);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif /* CUDA_DISABLER */

View File

@@ -1,228 +1,228 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
template <typename T, typename B> __global__ void pyrDown(const PtrStepSz<T> src, PtrStep<T> dst, const B b, int dst_cols)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_t;
__shared__ work_t smem[256 + 4];
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y;
const int src_y = 2 * y;
if (src_y >= 2 && src_y < src.rows - 2 && x >= 2 && x < src.cols - 2)
{
{
work_t sum;
sum = 0.0625f * src(src_y - 2, x);
sum = sum + 0.25f * src(src_y - 1, x);
sum = sum + 0.375f * src(src_y , x);
sum = sum + 0.25f * src(src_y + 1, x);
sum = sum + 0.0625f * src(src_y + 2, x);
smem[2 + threadIdx.x] = sum;
}
if (threadIdx.x < 2)
{
const int left_x = x - 2;
work_t sum;
sum = 0.0625f * src(src_y - 2, left_x);
sum = sum + 0.25f * src(src_y - 1, left_x);
sum = sum + 0.375f * src(src_y , left_x);
sum = sum + 0.25f * src(src_y + 1, left_x);
sum = sum + 0.0625f * src(src_y + 2, left_x);
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + 2;
work_t sum;
sum = 0.0625f * src(src_y - 2, right_x);
sum = sum + 0.25f * src(src_y - 1, right_x);
sum = sum + 0.375f * src(src_y , right_x);
sum = sum + 0.25f * src(src_y + 1, right_x);
sum = sum + 0.0625f * src(src_y + 2, right_x);
smem[4 + threadIdx.x] = sum;
}
}
else
{
{
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(x));
sum = sum + 0.375f * src(src_y , b.idx_col_high(x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(x));
smem[2 + threadIdx.x] = sum;
}
if (threadIdx.x < 2)
{
const int left_x = x - 2;
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col(left_x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col(left_x));
sum = sum + 0.375f * src(src_y , b.idx_col(left_x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col(left_x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col(left_x));
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + 2;
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(right_x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(right_x));
sum = sum + 0.375f * src(src_y , b.idx_col_high(right_x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(right_x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(right_x));
smem[4 + threadIdx.x] = sum;
}
}
__syncthreads();
if (threadIdx.x < 128)
{
const int tid2 = threadIdx.x * 2;
work_t sum;
sum = 0.0625f * smem[2 + tid2 - 2];
sum = sum + 0.25f * smem[2 + tid2 - 1];
sum = sum + 0.375f * smem[2 + tid2 ];
sum = sum + 0.25f * smem[2 + tid2 + 1];
sum = sum + 0.0625f * smem[2 + tid2 + 2];
const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
if (dst_x < dst_cols)
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
}
}
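// The kernel above is a separable 5-tap Gaussian (1, 4, 6, 4, 1) / 16: each thread filters
// one source column into shared memory (with a two-pixel apron on each side), then half of
// the threads filter along the row and keep every second sample. The single-tap weighting:
inline float gauss5(const float c[5])
{
    // c[0..4] are the samples at offsets -2..+2; the same weights are used in both passes
    return 0.0625f * c[0] + 0.25f * c[1] + 0.375f * c[2] + 0.25f * c[3] + 0.0625f * c[4];
}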
template <typename T, template <typename> class B> void pyrDown_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(src.cols, block.x), dst.rows);
B<T> b(src.rows, src.cols);
pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
pyrDown_caller<T, BrdReflect101>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
}
template void pyrDown_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
template <typename T, typename B> __global__ void pyrDown(const PtrStepSz<T> src, PtrStep<T> dst, const B b, int dst_cols)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_t;
__shared__ work_t smem[256 + 4];
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y;
const int src_y = 2 * y;
if (src_y >= 2 && src_y < src.rows - 2 && x >= 2 && x < src.cols - 2)
{
{
work_t sum;
sum = 0.0625f * src(src_y - 2, x);
sum = sum + 0.25f * src(src_y - 1, x);
sum = sum + 0.375f * src(src_y , x);
sum = sum + 0.25f * src(src_y + 1, x);
sum = sum + 0.0625f * src(src_y + 2, x);
smem[2 + threadIdx.x] = sum;
}
if (threadIdx.x < 2)
{
const int left_x = x - 2;
work_t sum;
sum = 0.0625f * src(src_y - 2, left_x);
sum = sum + 0.25f * src(src_y - 1, left_x);
sum = sum + 0.375f * src(src_y , left_x);
sum = sum + 0.25f * src(src_y + 1, left_x);
sum = sum + 0.0625f * src(src_y + 2, left_x);
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + 2;
work_t sum;
sum = 0.0625f * src(src_y - 2, right_x);
sum = sum + 0.25f * src(src_y - 1, right_x);
sum = sum + 0.375f * src(src_y , right_x);
sum = sum + 0.25f * src(src_y + 1, right_x);
sum = sum + 0.0625f * src(src_y + 2, right_x);
smem[4 + threadIdx.x] = sum;
}
}
else
{
{
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(x));
sum = sum + 0.375f * src(src_y , b.idx_col_high(x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(x));
smem[2 + threadIdx.x] = sum;
}
if (threadIdx.x < 2)
{
const int left_x = x - 2;
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col(left_x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col(left_x));
sum = sum + 0.375f * src(src_y , b.idx_col(left_x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col(left_x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col(left_x));
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + 2;
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(right_x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(right_x));
sum = sum + 0.375f * src(src_y , b.idx_col_high(right_x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(right_x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(right_x));
smem[4 + threadIdx.x] = sum;
}
}
__syncthreads();
if (threadIdx.x < 128)
{
const int tid2 = threadIdx.x * 2;
work_t sum;
sum = 0.0625f * smem[2 + tid2 - 2];
sum = sum + 0.25f * smem[2 + tid2 - 1];
sum = sum + 0.375f * smem[2 + tid2 ];
sum = sum + 0.25f * smem[2 + tid2 + 1];
sum = sum + 0.0625f * smem[2 + tid2 + 2];
const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
if (dst_x < dst_cols)
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
}
}
template <typename T, template <typename> class B> void pyrDown_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(src.cols, block.x), dst.rows);
B<T> b(src.rows, src.cols);
pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
pyrDown_caller<T, BrdReflect101>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
}
template void pyrDown_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */

View File

@@ -1,196 +1,196 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
template <typename T> __global__ void pyrUp(const PtrStepSz<T> src, PtrStepSz<T> dst)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
__shared__ sum_t s_srcPatch[10][10];
__shared__ sum_t s_dstPatch[20][16];
if (threadIdx.x < 10 && threadIdx.y < 10)
{
int srcx = static_cast<int>((blockIdx.x * blockDim.x) / 2 + threadIdx.x) - 1;
int srcy = static_cast<int>((blockIdx.y * blockDim.y) / 2 + threadIdx.y) - 1;
srcx = ::abs(srcx);
srcx = ::min(src.cols - 1, srcx);
srcy = ::abs(srcy);
srcy = ::min(src.rows - 1, srcy);
s_srcPatch[threadIdx.y][threadIdx.x] = saturate_cast<sum_t>(src(srcy, srcx));
}
__syncthreads();
sum_t sum = VecTraits<sum_t>::all(0);
const int evenFlag = static_cast<int>((threadIdx.x & 1) == 0);
const int oddFlag = static_cast<int>((threadIdx.x & 1) != 0);
const bool eveny = ((threadIdx.y & 1) == 0);
const int tidx = threadIdx.x;
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[2 + threadIdx.y][threadIdx.x] = sum;
if (threadIdx.y < 2)
{
sum = VecTraits<sum_t>::all(0);
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[threadIdx.y][threadIdx.x] = sum;
}
if (threadIdx.y > 13)
{
sum = VecTraits<sum_t>::all(0);
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + threadIdx.y][threadIdx.x] = sum;
}
__syncthreads();
sum = VecTraits<sum_t>::all(0);
const int tidy = threadIdx.y;
sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][threadIdx.x];
sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][threadIdx.x];
sum = sum + 0.375f * s_dstPatch[2 + tidy ][threadIdx.x];
sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][threadIdx.x];
sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][threadIdx.x];
if (x < dst.cols && y < dst.rows)
dst(y, x) = saturate_cast<T>(4.0f * sum);
}
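// The even/odd-flag arithmetic above implements the usual pyrUp scheme: zero-stuff the
// source by two, convolve with the same (1, 4, 6, 4, 1) / 16 kernel in both directions,
// and multiply by 4 to keep the brightness. A 1-D host reference, with simple border
// clamping standing in for the kernel's reflection logic (illustrative function name):
#include <algorithm>
#include <cstddef>
#include <vector>

inline std::vector<float> pyrUp1D(const std::vector<float>& src)
{
    const float w[5] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f };
    std::vector<float> up(src.size() * 2, 0.0f), dst(src.size() * 2, 0.0f);
    for (std::size_t i = 0; i < src.size(); ++i)
        up[2 * i] = src[i];                                        // zero-stuffed signal
    for (int i = 0; i < static_cast<int>(up.size()); ++i)
    {
        float sum = 0.0f;
        for (int k = -2; k <= 2; ++k)
        {
            int j = std::min(std::max(i + k, 0), static_cast<int>(up.size()) - 1);
            sum += w[k + 2] * up[j];
        }
        dst[i] = 4.0f * sum;                                       // x4 preserves the mean intensity
    }
    return dst;
}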
template <typename T> void pyrUp_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
pyrUp<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
pyrUp_caller<T>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
}
template void pyrUp_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
template <typename T> __global__ void pyrUp(const PtrStepSz<T> src, PtrStepSz<T> dst)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
__shared__ sum_t s_srcPatch[10][10];
__shared__ sum_t s_dstPatch[20][16];
if (threadIdx.x < 10 && threadIdx.y < 10)
{
int srcx = static_cast<int>((blockIdx.x * blockDim.x) / 2 + threadIdx.x) - 1;
int srcy = static_cast<int>((blockIdx.y * blockDim.y) / 2 + threadIdx.y) - 1;
srcx = ::abs(srcx);
srcx = ::min(src.cols - 1, srcx);
srcy = ::abs(srcy);
srcy = ::min(src.rows - 1, srcy);
s_srcPatch[threadIdx.y][threadIdx.x] = saturate_cast<sum_t>(src(srcy, srcx));
}
__syncthreads();
sum_t sum = VecTraits<sum_t>::all(0);
const int evenFlag = static_cast<int>((threadIdx.x & 1) == 0);
const int oddFlag = static_cast<int>((threadIdx.x & 1) != 0);
const bool eveny = ((threadIdx.y & 1) == 0);
const int tidx = threadIdx.x;
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[2 + threadIdx.y][threadIdx.x] = sum;
if (threadIdx.y < 2)
{
sum = VecTraits<sum_t>::all(0);
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[threadIdx.y][threadIdx.x] = sum;
}
if (threadIdx.y > 13)
{
sum = VecTraits<sum_t>::all(0);
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + threadIdx.y][threadIdx.x] = sum;
}
__syncthreads();
sum = VecTraits<sum_t>::all(0);
const int tidy = threadIdx.y;
sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][threadIdx.x];
sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][threadIdx.x];
sum = sum + 0.375f * s_dstPatch[2 + tidy ][threadIdx.x];
sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][threadIdx.x];
sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][threadIdx.x];
if (x < dst.cols && y < dst.rows)
dst(y, x) = saturate_cast<T>(4.0f * sum);
}
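// Launch helper: one 16x16 block per 16x16 output tile; synchronizes only when
// running on the default stream.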
template <typename T> void pyrUp_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
pyrUp<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
pyrUp_caller<T>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
}
template void pyrUp_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */

File diff suppressed because it is too large

View File

@@ -1,274 +1,274 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
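// remap kernel: every destination pixel (x, y) is sampled from the source at the
// floating-point coordinates (mapx(y, x), mapy(y, x)); interpolation and border
// handling are provided by the Ptr2D wrapper (a filter over a border reader).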
template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = mapx.ptr(y)[x];
const float ycoo = mapy.ptr(y)[x];
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
{
static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, int)
{
(void)srcWhole;
(void)xoff;
(void)yoff;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
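// For element types that can be bound to a 2D texture, the generic dispatcher above
// is specialized below: the whole source image is bound to a texture and read through
// a small reader struct. In the BrdReplicate specialization the texture's clamp
// addressing mode stands in for explicit border handling when the ROI covers the
// whole image.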
#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_remap_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff, yoff; \
tex_remap_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_remap_ ## type , x + xoff, y + yoff); \
} \
}; \
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float* borderValue, int cc) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc >= 20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float*, int) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc)
{
if (stream == 0)
RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc);
else
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);
}
};
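// Host-side entry point: 'interpolation' indexes {nearest, linear, cubic} and
// 'borderMode' indexes {reflect101, replicate, constant, reflect, wrap} in the caller
// table below; 'cc' (device compute capability) only affects the block height chosen
// in the texture path.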
template <typename T> void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc);
static const caller_t callers[3][5] =
{
{
RemapDispatcher<PointFilter, BrdReflect101, T>::call,
RemapDispatcher<PointFilter, BrdReplicate, T>::call,
RemapDispatcher<PointFilter, BrdConstant, T>::call,
RemapDispatcher<PointFilter, BrdReflect, T>::call,
RemapDispatcher<PointFilter, BrdWrap, T>::call
},
{
RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
RemapDispatcher<LinearFilter, BrdConstant, T>::call,
RemapDispatcher<LinearFilter, BrdReflect, T>::call,
RemapDispatcher<LinearFilter, BrdWrap, T>::call
},
{
RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
RemapDispatcher<CubicFilter, BrdConstant, T>::call,
RemapDispatcher<CubicFilter, BrdReflect, T>::call,
RemapDispatcher<CubicFilter, BrdWrap, T>::call
}
};
callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc);
}
template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */

View File

@@ -1,302 +1,302 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"
#include <cfloat>
#include <opencv2/gpu/device/scan.hpp>
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
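// resize kernel: destination pixel (x, y) maps back to the source at (x * fx, y * fy)
// and is sampled through the filtering wrapper; resize_area is the variant used by the
// area filters, which compute the averaged value directly from the destination coordinate.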
template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = x * fx;
const float ycoo = y * fy;
dst(y, x) = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <typename Ptr2D, typename T> __global__ void resize_area(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
dst(y, x) = saturate_cast<T>(src(y, x));
}
}
template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc, fx, fy);
resize<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <typename T> struct ResizeDispatcherStream<AreaFilter, T>
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
AreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T> struct ResizeDispatcherStream<IntegerAreaFilter, T>
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
IntegerAreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst)
{
(void)srcWhole;
(void)xoff;
(void)yoff;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
resize<<<grid, block>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
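// As in remap, texture-backed specializations are generated for element types that can
// be bound to a 2D texture; the clamp addressing mode replaces explicit BrdReplicate
// handling when the ROI covers the whole image.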
#define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_resize_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
const int xoff; \
const int yoff; \
__host__ tex_resize_ ## type ## _reader(int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_resize_ ## type, x + xoff, y + yoff); \
} \
}; \
template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type > \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz< type > dst) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_resize_ ## type, srcWhole); \
tex_resize_ ## type ## _reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter<tex_resize_ ## type ## _reader> filteredSrc(texSrc); \
resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
} \
else \
{ \
BrdReplicate< type > brd(src.rows, src.cols); \
BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > brdSrc(texSrc, brd); \
Filter< BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > > filteredSrc(brdSrc); \
resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
template <template <typename> class Filter, typename T> struct ResizeDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
if (stream == 0)
ResizeDispatcherNonStream<Filter, T>::call(src, srcWhole, xoff, yoff, fx, fy, dst);
else
ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
}
};
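// INTER_AREA is dispatched separately: when fx and fy are exact integers (the FLT_MIN
// tolerance effectively demands exactness) the IntegerAreaFilter path is taken,
// otherwise the general AreaFilter path.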
template <typename T> struct ResizeDispatcher<AreaFilter, T>
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
(void)srcWhole;
(void)xoff;
(void)yoff;
int iscale_x = (int)round(fx);
int iscale_y = (int)round(fy);
if( std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN)
ResizeDispatcherStream<IntegerAreaFilter, T>::call(src, fx, fy, dst, stream);
else
ResizeDispatcherStream<AreaFilter, T>::call(src, fx, fy, dst, stream);
}
};
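// Worked example for the dispatch above: for an exact 2x area downscale
// (fx == fy == 2.0f) both |fx - round(fx)| and |fy - round(fy)| are zero, so
// the faster IntegerAreaFilter path is taken; for fx == 2.5f the residual is
// 0.5f, the FLT_MIN test fails, and the general AreaFilter path is used.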
template <typename T> void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
PtrStepSzb dst, int interpolation, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream);
static const caller_t callers[4] =
{
ResizeDispatcher<PointFilter, T>::call,
ResizeDispatcher<LinearFilter, T>::call,
ResizeDispatcher<CubicFilter, T>::call,
ResizeDispatcher<AreaFilter, T>::call
};
// fall back to linear interpolation when area interpolation is requested for upscaling
if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
interpolation = 1;
callers[interpolation](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, fx, fy,
static_cast< PtrStepSz<T> >(dst), stream);
}
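// A minimal host-side sketch of how this entry point can be driven; the
// variable names and values below are illustrative and not part of this file.
// With src, srcWhole and dst already wrapping device buffers (e.g. views of
// cv::gpu::GpuMat allocations), an 8-bit 3-channel 2x area downscale on the
// default stream would look like:
//
//     resize_gpu<uchar3>(src, srcWhole, /*xoff=*/0, /*yoff=*/0,
//                        /*fx=*/2.f, /*fy=*/2.f, dst,
//                        /*interpolation=*/3, /*stream=*/0);
//
// interpolation follows the callers[] order above (0 nearest, 1 linear,
// 2 cubic, 3 area); fx and fy are the source-to-destination coordinate ratios
// used by the kernels, so values above 1 mean downscaling (see the area
// upscaling fallback above); xoff/yoff locate src inside srcWhole for the
// texture path.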
template void resize_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template<typename T> struct scan_traits{};
template<> struct scan_traits<uchar>
{
typedef float scan_line_type;
};
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */

View File

@@ -1,175 +1,175 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
namespace cv { namespace gpu { namespace device
{
namespace video_encoding
{
__device__ __forceinline__ void rgbtoy(const uchar b, const uchar g, const uchar r, uchar& y)
{
y = static_cast<uchar>(((int)(30 * r) + (int)(59 * g) + (int)(11 * b)) / 100);
}
__device__ __forceinline__ void rgbtoyuv(const uchar b, const uchar g, const uchar r, uchar& y, uchar& u, uchar& v)
{
rgbtoy(b, g, r, y);
u = static_cast<uchar>(((int)(-17 * r) - (int)(33 * g) + (int)(50 * b) + 12800) / 100);
v = static_cast<uchar>(((int)(50 * r) - (int)(42 * g) - (int)(8 * b) + 12800) / 100);
}
__global__ void Gray_to_YV12(const PtrStepSzb src, PtrStepb dst)
{
const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
if (x + 1 >= src.cols || y + 1 >= src.rows)
return;
// get pointers to the data
const size_t planeSize = src.rows * dst.step;
PtrStepb y_plane(dst.data, dst.step);
PtrStepb u_plane(y_plane.data + planeSize, dst.step / 2);
PtrStepb v_plane(u_plane.data + (planeSize / 4), dst.step / 2);
uchar pix;
uchar y_val, u_val, v_val;
pix = src(y, x);
rgbtoy(pix, pix, pix, y_val);
y_plane(y, x) = y_val;
pix = src(y, x + 1);
rgbtoy(pix, pix, pix, y_val);
y_plane(y, x + 1) = y_val;
pix = src(y + 1, x);
rgbtoy(pix, pix, pix, y_val);
y_plane(y + 1, x) = y_val;
pix = src(y + 1, x + 1);
rgbtoyuv(pix, pix, pix, y_val, u_val, v_val);
y_plane(y + 1, x + 1) = y_val;
u_plane(y / 2, x / 2) = u_val;
v_plane(y / 2, x / 2) = v_val;
}
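// Plane layout produced above, with illustrative numbers: for a 1920x1080
// source and dst.step == 1920, planeSize = 1080 * 1920 bytes. The
// full-resolution Y plane occupies [0, planeSize), the first chroma plane
// (u_plane) starts at offset planeSize with pitch dst.step / 2 == 960, and the
// second (v_plane) starts planeSize / 4 bytes after that with the same pitch,
// i.e. one chroma sample per 2x2 luma block as YV12 requires. Note that the
// chroma values are taken from the bottom-right pixel of each 2x2 block.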
template <typename T>
__global__ void BGR_to_YV12(const PtrStepSz<T> src, PtrStepb dst)
{
const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
if (x + 1 >= src.cols || y + 1 >= src.rows)
return;
// get pointers to the data
const size_t planeSize = src.rows * dst.step;
PtrStepb y_plane(dst.data, dst.step);
PtrStepb u_plane(y_plane.data + planeSize, dst.step / 2);
PtrStepb v_plane(u_plane.data + (planeSize / 4), dst.step / 2);
T pix;
uchar y_val, u_val, v_val;
pix = src(y, x);
rgbtoy(pix.z, pix.y, pix.x, y_val);
y_plane(y, x) = y_val;
pix = src(y, x + 1);
rgbtoy(pix.z, pix.y, pix.x, y_val);
y_plane(y, x + 1) = y_val;
pix = src(y + 1, x);
rgbtoy(pix.z, pix.y, pix.x, y_val);
y_plane(y + 1, x) = y_val;
pix = src(y + 1, x + 1);
rgbtoyuv(pix.z, pix.y, pix.x, y_val, u_val, v_val);
y_plane(y + 1, x + 1) = y_val;
u_plane(y / 2, x / 2) = u_val;
v_plane(y / 2, x / 2) = v_val;
}
void Gray_to_YV12_caller(const PtrStepSzb src, PtrStepb dst)
{
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2));
Gray_to_YV12<<<grid, block>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int cn>
void BGR_to_YV12_caller(const PtrStepSzb src, PtrStepb dst)
{
typedef typename TypeVec<uchar, cn>::vec_type src_t;
dim3 block(32, 8);
dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2));
BGR_to_YV12<<<grid, block>>>(static_cast< PtrStepSz<src_t> >(src), dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
void YV12_gpu(const PtrStepSzb src, int cn, PtrStepSzb dst)
{
typedef void (*func_t)(const PtrStepSzb src, PtrStepb dst);
static const func_t funcs[] =
{
0, Gray_to_YV12_caller, 0, BGR_to_YV12_caller<3>, BGR_to_YV12_caller<4>
};
funcs[cn](src, dst);
}
}
}}}
#endif /* CUDA_DISABLER */

View File

@@ -1,387 +1,387 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/static_check.hpp"
namespace cv { namespace gpu { namespace device
{
namespace row_filter
{
#define MAX_KERNEL_SIZE 32
__constant__ float c_kernel[MAX_KERNEL_SIZE];
void loadKernel(const float* kernel, int ksize, cudaStream_t stream)
{
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
}
template <int KSIZE, typename T, typename D, typename B>
__global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 8;
const int PATCH_PER_BLOCK = 4;
const int HALO_SIZE = 1;
#else
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 4;
const int PATCH_PER_BLOCK = 4;
const int HALO_SIZE = 1;
#endif
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
__shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X];
const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
if (y >= src.rows)
return;
const T* src_row = src.ptr(y);
const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;
if (blockIdx.x > 0)
{
//Load left halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]);
}
else
{
//Load left halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));
}
if (blockIdx.x + 2 < gridDim.x)
{
//Load main data
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]);
//Load right halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]);
}
else
{
//Load main data
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));
//Load right halo
#pragma unroll
for (int j = 0; j < HALO_SIZE; ++j)
smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));
}
__syncthreads();
#pragma unroll
for (int j = 0; j < PATCH_PER_BLOCK; ++j)
{
const int x = xStart + j * BLOCK_DIM_X;
if (x < src.cols)
{
sum_t sum = VecTraits<sum_t>::all(0);
#pragma unroll
for (int k = 0; k < KSIZE; ++k)
sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
dst(y, x) = saturate_cast<D>(sum);
}
}
}
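// Tile arithmetic for the kernel above on cc >= 2.0 hardware (BLOCK_DIM_Y == 8):
// each block row stages (PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X
// = (4 + 2) * 32 = 192 pixels in shared memory, of which the central
// 4 * 32 = 128 are output columns and 32 pixels on each side are halo.
// A 32-pixel halo is always sufficient because MAX_KERNEL_SIZE is 32, so no
// output pixel reads more than 31 taps to either side of itself.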
template <int KSIZE, typename T, typename D, template<typename> class B>
void linearRowFilter_caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
{
int BLOCK_DIM_X;
int BLOCK_DIM_Y;
int PATCH_PER_BLOCK;
if (cc >= 20)
{
BLOCK_DIM_X = 32;
BLOCK_DIM_Y = 8;
PATCH_PER_BLOCK = 4;
}
else
{
BLOCK_DIM_X = 32;
BLOCK_DIM_Y = 4;
PATCH_PER_BLOCK = 4;
}
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y));
B<T> brd(src.cols);
linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, typename D>
void linearRowFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
static const caller_t callers[5][33] =
{
{
0,
linearRowFilter_caller< 1, T, D, BrdRowReflect101>,
linearRowFilter_caller< 2, T, D, BrdRowReflect101>,
linearRowFilter_caller< 3, T, D, BrdRowReflect101>,
linearRowFilter_caller< 4, T, D, BrdRowReflect101>,
linearRowFilter_caller< 5, T, D, BrdRowReflect101>,
linearRowFilter_caller< 6, T, D, BrdRowReflect101>,
linearRowFilter_caller< 7, T, D, BrdRowReflect101>,
linearRowFilter_caller< 8, T, D, BrdRowReflect101>,
linearRowFilter_caller< 9, T, D, BrdRowReflect101>,
linearRowFilter_caller<10, T, D, BrdRowReflect101>,
linearRowFilter_caller<11, T, D, BrdRowReflect101>,
linearRowFilter_caller<12, T, D, BrdRowReflect101>,
linearRowFilter_caller<13, T, D, BrdRowReflect101>,
linearRowFilter_caller<14, T, D, BrdRowReflect101>,
linearRowFilter_caller<15, T, D, BrdRowReflect101>,
linearRowFilter_caller<16, T, D, BrdRowReflect101>,
linearRowFilter_caller<17, T, D, BrdRowReflect101>,
linearRowFilter_caller<18, T, D, BrdRowReflect101>,
linearRowFilter_caller<19, T, D, BrdRowReflect101>,
linearRowFilter_caller<20, T, D, BrdRowReflect101>,
linearRowFilter_caller<21, T, D, BrdRowReflect101>,
linearRowFilter_caller<22, T, D, BrdRowReflect101>,
linearRowFilter_caller<23, T, D, BrdRowReflect101>,
linearRowFilter_caller<24, T, D, BrdRowReflect101>,
linearRowFilter_caller<25, T, D, BrdRowReflect101>,
linearRowFilter_caller<26, T, D, BrdRowReflect101>,
linearRowFilter_caller<27, T, D, BrdRowReflect101>,
linearRowFilter_caller<28, T, D, BrdRowReflect101>,
linearRowFilter_caller<29, T, D, BrdRowReflect101>,
linearRowFilter_caller<30, T, D, BrdRowReflect101>,
linearRowFilter_caller<31, T, D, BrdRowReflect101>,
linearRowFilter_caller<32, T, D, BrdRowReflect101>
},
{
0,
linearRowFilter_caller< 1, T, D, BrdRowReplicate>,
linearRowFilter_caller< 2, T, D, BrdRowReplicate>,
linearRowFilter_caller< 3, T, D, BrdRowReplicate>,
linearRowFilter_caller< 4, T, D, BrdRowReplicate>,
linearRowFilter_caller< 5, T, D, BrdRowReplicate>,
linearRowFilter_caller< 6, T, D, BrdRowReplicate>,
linearRowFilter_caller< 7, T, D, BrdRowReplicate>,
linearRowFilter_caller< 8, T, D, BrdRowReplicate>,
linearRowFilter_caller< 9, T, D, BrdRowReplicate>,
linearRowFilter_caller<10, T, D, BrdRowReplicate>,
linearRowFilter_caller<11, T, D, BrdRowReplicate>,
linearRowFilter_caller<12, T, D, BrdRowReplicate>,
linearRowFilter_caller<13, T, D, BrdRowReplicate>,
linearRowFilter_caller<14, T, D, BrdRowReplicate>,
linearRowFilter_caller<15, T, D, BrdRowReplicate>,
linearRowFilter_caller<16, T, D, BrdRowReplicate>,
linearRowFilter_caller<17, T, D, BrdRowReplicate>,
linearRowFilter_caller<18, T, D, BrdRowReplicate>,
linearRowFilter_caller<19, T, D, BrdRowReplicate>,
linearRowFilter_caller<20, T, D, BrdRowReplicate>,
linearRowFilter_caller<21, T, D, BrdRowReplicate>,
linearRowFilter_caller<22, T, D, BrdRowReplicate>,
linearRowFilter_caller<23, T, D, BrdRowReplicate>,
linearRowFilter_caller<24, T, D, BrdRowReplicate>,
linearRowFilter_caller<25, T, D, BrdRowReplicate>,
linearRowFilter_caller<26, T, D, BrdRowReplicate>,
linearRowFilter_caller<27, T, D, BrdRowReplicate>,
linearRowFilter_caller<28, T, D, BrdRowReplicate>,
linearRowFilter_caller<29, T, D, BrdRowReplicate>,
linearRowFilter_caller<30, T, D, BrdRowReplicate>,
linearRowFilter_caller<31, T, D, BrdRowReplicate>,
linearRowFilter_caller<32, T, D, BrdRowReplicate>
},
{
0,
linearRowFilter_caller< 1, T, D, BrdRowConstant>,
linearRowFilter_caller< 2, T, D, BrdRowConstant>,
linearRowFilter_caller< 3, T, D, BrdRowConstant>,
linearRowFilter_caller< 4, T, D, BrdRowConstant>,
linearRowFilter_caller< 5, T, D, BrdRowConstant>,
linearRowFilter_caller< 6, T, D, BrdRowConstant>,
linearRowFilter_caller< 7, T, D, BrdRowConstant>,
linearRowFilter_caller< 8, T, D, BrdRowConstant>,
linearRowFilter_caller< 9, T, D, BrdRowConstant>,
linearRowFilter_caller<10, T, D, BrdRowConstant>,
linearRowFilter_caller<11, T, D, BrdRowConstant>,
linearRowFilter_caller<12, T, D, BrdRowConstant>,
linearRowFilter_caller<13, T, D, BrdRowConstant>,
linearRowFilter_caller<14, T, D, BrdRowConstant>,
linearRowFilter_caller<15, T, D, BrdRowConstant>,
linearRowFilter_caller<16, T, D, BrdRowConstant>,
linearRowFilter_caller<17, T, D, BrdRowConstant>,
linearRowFilter_caller<18, T, D, BrdRowConstant>,
linearRowFilter_caller<19, T, D, BrdRowConstant>,
linearRowFilter_caller<20, T, D, BrdRowConstant>,
linearRowFilter_caller<21, T, D, BrdRowConstant>,
linearRowFilter_caller<22, T, D, BrdRowConstant>,
linearRowFilter_caller<23, T, D, BrdRowConstant>,
linearRowFilter_caller<24, T, D, BrdRowConstant>,
linearRowFilter_caller<25, T, D, BrdRowConstant>,
linearRowFilter_caller<26, T, D, BrdRowConstant>,
linearRowFilter_caller<27, T, D, BrdRowConstant>,
linearRowFilter_caller<28, T, D, BrdRowConstant>,
linearRowFilter_caller<29, T, D, BrdRowConstant>,
linearRowFilter_caller<30, T, D, BrdRowConstant>,
linearRowFilter_caller<31, T, D, BrdRowConstant>,
linearRowFilter_caller<32, T, D, BrdRowConstant>
},
{
0,
linearRowFilter_caller< 1, T, D, BrdRowReflect>,
linearRowFilter_caller< 2, T, D, BrdRowReflect>,
linearRowFilter_caller< 3, T, D, BrdRowReflect>,
linearRowFilter_caller< 4, T, D, BrdRowReflect>,
linearRowFilter_caller< 5, T, D, BrdRowReflect>,
linearRowFilter_caller< 6, T, D, BrdRowReflect>,
linearRowFilter_caller< 7, T, D, BrdRowReflect>,
linearRowFilter_caller< 8, T, D, BrdRowReflect>,
linearRowFilter_caller< 9, T, D, BrdRowReflect>,
linearRowFilter_caller<10, T, D, BrdRowReflect>,
linearRowFilter_caller<11, T, D, BrdRowReflect>,
linearRowFilter_caller<12, T, D, BrdRowReflect>,
linearRowFilter_caller<13, T, D, BrdRowReflect>,
linearRowFilter_caller<14, T, D, BrdRowReflect>,
linearRowFilter_caller<15, T, D, BrdRowReflect>,
linearRowFilter_caller<16, T, D, BrdRowReflect>,
linearRowFilter_caller<17, T, D, BrdRowReflect>,
linearRowFilter_caller<18, T, D, BrdRowReflect>,
linearRowFilter_caller<19, T, D, BrdRowReflect>,
linearRowFilter_caller<20, T, D, BrdRowReflect>,
linearRowFilter_caller<21, T, D, BrdRowReflect>,
linearRowFilter_caller<22, T, D, BrdRowReflect>,
linearRowFilter_caller<23, T, D, BrdRowReflect>,
linearRowFilter_caller<24, T, D, BrdRowReflect>,
linearRowFilter_caller<25, T, D, BrdRowReflect>,
linearRowFilter_caller<26, T, D, BrdRowReflect>,
linearRowFilter_caller<27, T, D, BrdRowReflect>,
linearRowFilter_caller<28, T, D, BrdRowReflect>,
linearRowFilter_caller<29, T, D, BrdRowReflect>,
linearRowFilter_caller<30, T, D, BrdRowReflect>,
linearRowFilter_caller<31, T, D, BrdRowReflect>,
linearRowFilter_caller<32, T, D, BrdRowReflect>
},
{
0,
linearRowFilter_caller< 1, T, D, BrdRowWrap>,
linearRowFilter_caller< 2, T, D, BrdRowWrap>,
linearRowFilter_caller< 3, T, D, BrdRowWrap>,
linearRowFilter_caller< 4, T, D, BrdRowWrap>,
linearRowFilter_caller< 5, T, D, BrdRowWrap>,
linearRowFilter_caller< 6, T, D, BrdRowWrap>,
linearRowFilter_caller< 7, T, D, BrdRowWrap>,
linearRowFilter_caller< 8, T, D, BrdRowWrap>,
linearRowFilter_caller< 9, T, D, BrdRowWrap>,
linearRowFilter_caller<10, T, D, BrdRowWrap>,
linearRowFilter_caller<11, T, D, BrdRowWrap>,
linearRowFilter_caller<12, T, D, BrdRowWrap>,
linearRowFilter_caller<13, T, D, BrdRowWrap>,
linearRowFilter_caller<14, T, D, BrdRowWrap>,
linearRowFilter_caller<15, T, D, BrdRowWrap>,
linearRowFilter_caller<16, T, D, BrdRowWrap>,
linearRowFilter_caller<17, T, D, BrdRowWrap>,
linearRowFilter_caller<18, T, D, BrdRowWrap>,
linearRowFilter_caller<19, T, D, BrdRowWrap>,
linearRowFilter_caller<20, T, D, BrdRowWrap>,
linearRowFilter_caller<21, T, D, BrdRowWrap>,
linearRowFilter_caller<22, T, D, BrdRowWrap>,
linearRowFilter_caller<23, T, D, BrdRowWrap>,
linearRowFilter_caller<24, T, D, BrdRowWrap>,
linearRowFilter_caller<25, T, D, BrdRowWrap>,
linearRowFilter_caller<26, T, D, BrdRowWrap>,
linearRowFilter_caller<27, T, D, BrdRowWrap>,
linearRowFilter_caller<28, T, D, BrdRowWrap>,
linearRowFilter_caller<29, T, D, BrdRowWrap>,
linearRowFilter_caller<30, T, D, BrdRowWrap>,
linearRowFilter_caller<31, T, D, BrdRowWrap>,
linearRowFilter_caller<32, T, D, BrdRowWrap>
}
};
loadKernel(kernel, ksize, stream);
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
}
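// Host-side usage sketch (illustrative, not part of this file): because
// loadKernel() copies with cudaMemcpyDeviceToDevice, the coefficient array
// must already reside in device memory. With a 3-tap smoothing kernel
// { 0.25f, 0.5f, 0.25f } uploaded to a device buffer d_kernel, a call could be
//
//     linearRowFilter_gpu<uchar, float>(src, dst, d_kernel, /*ksize=*/3,
//                                       /*anchor=*/1, /*brd_type=*/1,
//                                       /*cc=*/20, /*stream=*/0);
//
// brd_type indexes the rows of callers[] in declaration order (0 reflect101,
// 1 replicate, 2 constant, 3 reflect, 4 wrap) and cc is the device compute
// capability (e.g. 20 for Fermi), used only to choose the launch shape.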
template void linearRowFilter_gpu<uchar , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<uchar4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<int , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearRowFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
} // namespace row_filter
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */

View File

@@ -1,95 +1,95 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_CUDA_SAFE_CALL_HPP__
#define __OPENCV_CUDA_SAFE_CALL_HPP__
#include <cuda_runtime_api.h>
#include <cufft.h>
#include <cublas.h>
#include "NCV.hpp"
#if defined(__GNUC__)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
#define ncvSafeCall(expr) ___ncvSafeCall(expr, __FILE__, __LINE__, __func__)
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__)
#define ncvSafeCall(expr) ___ncvSafeCall(expr, __FILE__, __LINE__)
#define cufftSafeCall(expr) ___cufftSafeCall(expr, __FILE__, __LINE__)
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__)
#endif
namespace cv { namespace gpu
{
void nppError(int err, const char *file, const int line, const char *func = "");
void ncvError(int err, const char *file, const int line, const char *func = "");
void cufftError(int err, const char *file, const int line, const char *func = "");
void cublasError(int err, const char *file, const int line, const char *func = "");
}}
static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)
cv::gpu::nppError(err, file, line, func);
}
static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (NCV_SUCCESS != err)
cv::gpu::ncvError(err, file, line, func);
}
static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
{
if (CUFFT_SUCCESS != err)
cv::gpu::cufftError(err, file, line, func);
}
static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
{
if (CUBLAS_STATUS_SUCCESS != err)
cv::gpu::cublasError(err, file, line, func);
}
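// Typical use of these wrappers (illustrative): any CUFFT or CUBLAS call that
// returns a status code can be checked in place, e.g.
//
//     cufftHandle plan;
//     cufftSafeCall( cufftPlan2d(&plan, rows, cols, CUFFT_C2C) );
//     cublasSafeCall( cublasInit() );
//
// On a non-success status the matching cv::gpu::*Error handler receives the
// file, the line and, when built with GCC, the enclosing function name.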
#endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -56,14 +56,14 @@ namespace cv
template<class T, enum cudaTextureReadMode readMode>
TextureBinder(const PtrStepSz<T>& arr, const struct texture<T, 2, readMode>& tex) : texref(&tex)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step) );
}
template<class T, enum cudaTextureReadMode readMode>
TextureBinder(const PtrSz<T>& arr, const struct texture<T, 1, readMode> &tex) : texref(&tex)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture(0, tex, arr.data, desc, arr.size * arr.elemSize()) );
}

View File

@@ -1,389 +1,389 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "opencv2/gpu/device/vec_traits.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"
namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
__constant__ float c_warpMat[3 * 3];
struct AffineTransform
{
static __device__ __forceinline__ float2 calcCoord(int x, int y)
{
const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];
return make_float2(xcoo, ycoo);
}
};
struct PerspectiveTransform
{
static __device__ __forceinline__ float2 calcCoord(int x, int y)
{
const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);
return make_float2(xcoo, ycoo);
}
};
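// Note: both functors implement the usual inverse (destination -> source) mapping:
//   affine:       xs = m0*x + m1*y + m2,  ys = m3*x + m4*y + m5
//   perspective:  (xs, ys) = (m0*x + m1*y + m2, m3*x + m4*y + m5) / (m6*x + m7*y + m8)
// c_warpMat stores the coefficients row-major, so an affine warp fills only the
// first six entries while a perspective warp uses the full 3 x 3 matrix.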
///////////////////////////////////////////////////////////////////
// Build Maps
template <class Transform> __global__ void buildWarpMaps(PtrStepSzf xmap, PtrStepf ymap)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < xmap.cols && y < xmap.rows)
{
const float2 coord = Transform::calcCoord(x, y);
xmap(y, x) = coord.x;
ymap(y, x) = coord.y;
}
}
template <class Transform> void buildWarpMaps_caller(PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(xmap.cols, block.x), divUp(xmap.rows, block.y));
buildWarpMaps<Transform><<<grid, block, 0, stream>>>(xmap, ymap);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
buildWarpMaps_caller<AffineTransform>(xmap, ymap, stream);
}
void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
buildWarpMaps_caller<PerspectiveTransform>(xmap, ymap, stream);
}
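// Hedged host-side sketch (illustrative only): the coefficients are expected in
// row-major order and describe the destination -> source mapping, e.g.
//   float coeffs[2 * 3] = { m00, m01, m02,
//                           m10, m11, m12 };
//   buildWarpAffineMaps_gpu(coeffs, xmap, ymap, stream);
// with xmap/ymap pre-allocated single-channel float maps of the destination size,
// ready to be consumed by a subsequent remap.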
///////////////////////////////////////////////////////////////////
// Warp
template <class Transform, class Ptr2D, typename T> __global__ void warp(const Ptr2D src, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float2 coord = Transform::calcCoord(x, y);
dst.ptr(y)[x] = saturate_cast<T>(src(coord.y, coord.x));
}
}
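// Each thread produces one destination pixel: it maps (x, y) back into the source,
// lets the Filter/BorderReader wrapper passed in as `src` handle interpolation and
// out-of-range reads, and saturates the interpolated value to the destination type.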
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, int)
{
(void)xoff;
(void)yoff;
(void)srcWhole;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
warp<Transform><<<grid, block>>>(filter_src, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_WARP_TEX(type) \
texture< type , cudaTextureType2D > tex_warp_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_warp_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff, yoff; \
tex_warp_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_warp_ ## type , x + xoff, y + yoff); \
} \
}; \
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, int cc) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc >= 20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, int) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
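// The macro above specializes WarpDispatcherNonStream for element types that have a
// texture fetch path: reads go through tex2D using the xoff/yoff offset of the ROI
// inside the whole image, and the BrdReplicate specialization can bypass the
// BorderReader entirely when the ROI covers the whole image. The commented-out
// instantiations below are element types this path is not built for.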
OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(schar)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(char2)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(char4)
OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_WARP_TEX(short)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(short2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(short4)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(int)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(int2)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(int4)
OPENCV_GPU_IMPLEMENT_WARP_TEX(float)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(float2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_WARP_TEX
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc)
{
if (stream == 0)
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc);
else
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc);
}
};
template <class Transform, typename T>
void warp_caller(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
{
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, int cc);
static const func_t funcs[3][5] =
{
{
WarpDispatcher<Transform, PointFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, PointFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, PointFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, PointFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call
},
{
WarpDispatcher<Transform, LinearFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call
},
{
WarpDispatcher<Transform, CubicFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call
}
};
funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc);
}
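// The table above is indexed as funcs[interpolation][borderMode]: rows correspond to
// nearest / linear / cubic filtering and columns to the reflect101 / replicate /
// constant / reflect / wrap border modes. The numeric codes themselves are assumed to
// be assigned on the host side of the gpu module in this same order.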
template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
}
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template <typename T> void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc);
}
template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
#endif /* CUDA_DISABLER */
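To show how these instantiations are meant to be reached from host code, here is a hedged sketch; the wrapper name, the zero border value, and the assumption that the warpAffine_gpu declaration and the PtrStepSzb typedef are visible to the host translation unit are all illustrative.
// Hypothetical host-side call into the 8UC3 instantiation declared above.
void exampleWarpAffine(cv::gpu::PtrStepSzb src, cv::gpu::PtrStepSzb dst,
                       float M[2 * 3], int interpolation, int borderMode,
                       cudaStream_t stream, int cc)
{
    const float borderValue[4] = { 0.f, 0.f, 0.f, 0.f };   // broadcast to all channels
    cv::gpu::device::imgproc::warpAffine_gpu<uchar3>(
        src, src /* srcWhole: full image, no ROI offset */, 0, 0, M,
        dst, interpolation, borderMode, borderValue, stream, cc);
}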