gpuimgproc module for image processing

Author: Vladislav Vinogradov
Date: 2013-04-17 18:14:35 +04:00
parent d569e72ad4
commit e41aea0acf
66 changed files with 889 additions and 404 deletions


@@ -1,99 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
#else
namespace cv { namespace gpu { namespace cudev
{
namespace blend
{
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
}
}}}
using namespace ::cv::gpu::cudev::blend;
void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
GpuMat& result, Stream& stream)
{
CV_Assert(img1.size() == img2.size());
CV_Assert(img1.type() == img2.type());
CV_Assert(weights1.size() == img1.size());
CV_Assert(weights2.size() == img2.size());
CV_Assert(weights1.type() == CV_32F);
CV_Assert(weights2.type() == CV_32F);
const Size size = img1.size();
const int depth = img1.depth();
const int cn = img1.channels();
result.create(size, CV_MAKE_TYPE(depth, cn));
switch (depth)
{
case CV_8U:
if (cn != 4)
blendLinearCaller<uchar>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
else
blendLinearCaller8UC4(size.height, size.width, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
break;
case CV_32F:
blendLinearCaller<float>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
break;
default:
CV_Error(cv::Error::StsUnsupportedFormat, "bad image depth in linear blending function");
}
}
#endif
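
For context, here is a minimal host-side usage sketch of the blendLinear wrapper above. The image file names, weight values and header paths are assumptions for illustration and are not part of this commit.

// Sketch: blend two equally sized, same-type images with per-pixel CV_32F weights.
#include "opencv2/gpu/gpu.hpp"          // header path assumed for this stage of the module split
#include "opencv2/highgui/highgui.hpp"

int main()
{
    cv::Mat img1 = cv::imread("left.png");   // hypothetical inputs
    cv::Mat img2 = cv::imread("right.png");
    cv::Mat w1(img1.size(), CV_32F, cv::Scalar(0.7f));   // arbitrary example weights
    cv::Mat w2(img2.size(), CV_32F, cv::Scalar(0.3f));

    cv::gpu::GpuMat d_img1(img1), d_img2(img2), d_w1(w1), d_w2(w2), d_result;
    cv::gpu::blendLinear(d_img1, d_img2, d_w1, d_w2, d_result, cv::gpu::Stream::Null());

    cv::Mat result;
    d_result.download(result);   // result(y,x) = (w1*img1 + w2*img2) / (w1 + w2)
    return 0;
}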

File diff suppressed because it is too large


@@ -1,199 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
using namespace cv::gpu;
typedef unsigned char uchar;
typedef unsigned short ushort;
//////////////////////////////////////////////////////////////////////////////////
/// Bilateral filtering
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
__device__ __forceinline__ float norm_l1(const float& a) { return ::fabs(a); }
__device__ __forceinline__ float norm_l1(const float2& a) { return ::fabs(a.x) + ::fabs(a.y); }
__device__ __forceinline__ float norm_l1(const float3& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z); }
__device__ __forceinline__ float norm_l1(const float4& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z) + ::fabs(a.w); }
__device__ __forceinline__ float sqr(const float& a) { return a * a; }
template<typename T, typename B>
__global__ void bilateral_kernel(const PtrStepSz<T> src, PtrStep<T> dst, const B b, const int ksz, const float sigma_spatial2_inv_half, const float sigma_color2_inv_half)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x >= src.cols || y >= src.rows)
return;
value_type center = saturate_cast<value_type>(src(y, x));
value_type sum1 = VecTraits<value_type>::all(0);
float sum2 = 0;
int r = ksz / 2;
float r2 = (float)(r * r);
int tx = x - r + ksz;
int ty = y - r + ksz;
if (x - ksz/2 >=0 && y - ksz/2 >=0 && tx < src.cols && ty < src.rows)
{
for (int cy = y - r; cy < ty; ++cy)
for (int cx = x - r; cx < tx; ++cx)
{
float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
if (space2 > r2)
continue;
value_type value = saturate_cast<value_type>(src(cy, cx));
float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
sum1 = sum1 + weight * value;
sum2 = sum2 + weight;
}
}
else
{
for (int cy = y - r; cy < ty; ++cy)
for (int cx = x - r; cx < tx; ++cx)
{
float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
if (space2 > r2)
continue;
value_type value = saturate_cast<value_type>(b.at(cy, cx, src.data, src.step));
float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
sum1 = sum1 + weight * value;
sum2 = sum2 + weight;
}
}
dst(y, x) = saturate_cast<T>(sum1 / sum2);
}
template<typename T, template <typename> class B>
void bilateral_caller(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream)
{
dim3 block (32, 8);
dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y));
B<T> b(src.rows, src.cols);
float sigma_spatial2_inv_half = -0.5f/(sigma_spatial * sigma_spatial);
float sigma_color2_inv_half = -0.5f/(sigma_color * sigma_color);
cudaSafeCall( cudaFuncSetCacheConfig (bilateral_kernel<T, B<T> >, cudaFuncCachePreferL1) );
bilateral_kernel<<<grid, block>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, kernel_size, sigma_spatial2_inv_half, sigma_color2_inv_half);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template<typename T>
void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float gauss_spatial_coeff, float gauss_color_coeff, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream);
static caller_t funcs[] =
{
bilateral_caller<T, BrdReflect101>,
bilateral_caller<T, BrdReplicate>,
bilateral_caller<T, BrdConstant>,
bilateral_caller<T, BrdReflect>,
bilateral_caller<T, BrdWrap>,
};
funcs[borderMode](src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream);
}
}
}}}
#define OCV_INSTANTIATE_BILATERAL_FILTER(T) \
template void cv::gpu::cudev::imgproc::bilateral_filter_gpu<T>(const PtrStepSzb&, PtrStepSzb, int, float, float, int, cudaStream_t);
OCV_INSTANTIATE_BILATERAL_FILTER(uchar)
//OCV_INSTANTIATE_BILATERAL_FILTER(uchar2)
OCV_INSTANTIATE_BILATERAL_FILTER(uchar3)
OCV_INSTANTIATE_BILATERAL_FILTER(uchar4)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar2)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar3)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar4)
OCV_INSTANTIATE_BILATERAL_FILTER(short)
//OCV_INSTANTIATE_BILATERAL_FILTER(short2)
OCV_INSTANTIATE_BILATERAL_FILTER(short3)
OCV_INSTANTIATE_BILATERAL_FILTER(short4)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort)
//OCV_INSTANTIATE_BILATERAL_FILTER(ushort2)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort3)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort4)
//OCV_INSTANTIATE_BILATERAL_FILTER(int)
//OCV_INSTANTIATE_BILATERAL_FILTER(int2)
//OCV_INSTANTIATE_BILATERAL_FILTER(int3)
//OCV_INSTANTIATE_BILATERAL_FILTER(int4)
OCV_INSTANTIATE_BILATERAL_FILTER(float)
//OCV_INSTANTIATE_BILATERAL_FILTER(float2)
OCV_INSTANTIATE_BILATERAL_FILTER(float3)
OCV_INSTANTIATE_BILATERAL_FILTER(float4)
#endif /* CUDA_DISABLER */
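
For reference, the weight computed in bilateral_kernel is the standard bilateral-filter weight with an L1 colour distance and a circular spatial window of radius r = ksz/2:

w(p,q) = \exp\left( -\frac{\lVert p - q \rVert_2^2}{2\sigma_s^2} - \frac{\lVert I(p) - I(q) \rVert_1^2}{2\sigma_c^2} \right), \qquad
D(p) = \frac{\sum_{\lVert p - q \rVert_2^2 \le r^2} w(p,q)\, I(q)}{\sum_{\lVert p - q \rVert_2^2 \le r^2} w(p,q)}

The precomputed factors sigma_spatial2_inv_half = -0.5/sigma_spatial^2 and sigma_color2_inv_half = -0.5/sigma_color^2 are exactly the two -1/(2*sigma^2) coefficients in the exponent.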


@@ -1,121 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace blend
{
template <typename T>
__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < rows && x < cols)
{
int x_ = x / cn;
float w1 = weights1.ptr(y)[x_];
float w2 = weights2.ptr(y)[x_];
T p1 = img1.ptr(y)[x];
T p2 = img2.ptr(y)[x];
result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
}
}
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));
blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < rows && x < cols)
{
float w1 = weights1.ptr(y)[x];
float w2 = weights2.ptr(y)[x];
float sum_inv = 1.f / (w1 + w2 + 1e-5f);
w1 *= sum_inv;
w2 *= sum_inv;
uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];
uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];
((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,
p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);
}
}
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
} // namespace blend
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
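
Per pixel and per channel, both kernels above compute

result(y,x) = \frac{w_1(y,x)\, p_1(y,x) + w_2(y,x)\, p_2(y,x)}{w_1(y,x) + w_2(y,x) + 10^{-5}}

where the 1e-5 term guards against division by zero when both weights vanish; the 8UC4 specialisation normalises the two weights once and applies them to all four channels.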


@@ -1,494 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <utility>
#include <algorithm>//std::swap
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/utility.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace canny
{
struct L1 : binary_function<int, int, float>
{
__device__ __forceinline__ float operator ()(int x, int y) const
{
return ::abs(x) + ::abs(y);
}
__device__ __forceinline__ L1() {}
__device__ __forceinline__ L1(const L1&) {}
};
struct L2 : binary_function<int, int, float>
{
__device__ __forceinline__ float operator ()(int x, int y) const
{
return ::sqrtf(x * x + y * y);
}
__device__ __forceinline__ L2() {}
__device__ __forceinline__ L2(const L2&) {}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
{
enum { smart_shift = 4 };
};
template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
{
enum { smart_shift = 4 };
};
}}}
namespace canny
{
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
struct SrcTex
{
const int xoff;
const int yoff;
__host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
__device__ __forceinline__ int operator ()(int y, int x) const
{
return tex2D(tex_src, x + xoff, y + yoff);
}
};
template <class Norm> __global__
void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y >= mag.rows || x >= mag.cols)
return;
int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
dx(y, x) = dxVal;
dy(y, x) = dyVal;
mag(y, x) = norm(dxVal, dyVal);
}
void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
{
const dim3 block(16, 16);
const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
bindTexture(&tex_src, srcWhole);
SrcTex src(xoff, yoff);
if (L2Grad)
{
L2 norm;
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
}
else
{
L1 norm;
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
{
if (L2Grad)
{
L2 norm;
transform(dx, dy, mag, norm, WithOutMask(), 0);
}
else
{
L1 norm;
transform(dx, dy, mag, norm, WithOutMask(), 0);
}
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
{
const int CANNY_SHIFT = 15;
const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
return;
int dxVal = dx(y, x);
int dyVal = dy(y, x);
const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
const float m = tex2D(tex_mag, x, y);
dxVal = ::abs(dxVal);
dyVal = ::abs(dyVal);
// 0 - the pixel can not belong to an edge
// 1 - the pixel might belong to an edge
// 2 - the pixel does belong to an edge
int edge_type = 0;
if (m > low_thresh)
{
const int tg22x = dxVal * TG22;
const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
dyVal <<= CANNY_SHIFT;
if (dyVal < tg22x)
{
if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
edge_type = 1 + (int)(m > high_thresh);
}
else if(dyVal > tg67x)
{
if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
else
{
if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
}
map(y, x) = edge_type;
}
void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
bindTexture(&tex_mag, mag);
calcMapKernel<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
__device__ int counter = 0;
__global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
{
__shared__ volatile int smem[18][18];
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
if (threadIdx.y == 0)
smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
if (threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
if (threadIdx.x == 0)
smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1)
smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == 0)
smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
__syncthreads();
if (x >= map.cols || y >= map.rows)
return;
int n;
#pragma unroll
for (int k = 0; k < 16; ++k)
{
n = 0;
if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
{
n += smem[threadIdx.y ][threadIdx.x ] == 2;
n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
}
if (n > 0)
smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
}
const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
map(y, x) = e;
n = 0;
if (e == 2)
{
n += smem[threadIdx.y ][threadIdx.x ] == 1;
n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
}
if (n > 0)
{
const int ind = ::atomicAdd(&counter, 1);
st[ind] = make_ushort2(x, y);
}
}
void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
const dim3 block(16, 16);
const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
edgesHysteresisLocalKernel<<<grid, block>>>(map, st1);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
__global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
{
const int stack_size = 512;
__shared__ int s_counter;
__shared__ int s_ind;
__shared__ ushort2 s_st[stack_size];
if (threadIdx.x == 0)
s_counter = 0;
__syncthreads();
int ind = blockIdx.y * gridDim.x + blockIdx.x;
if (ind >= count)
return;
ushort2 pos = st1[ind];
if (threadIdx.x < 8)
{
pos.x += c_dx[threadIdx.x];
pos.y += c_dy[threadIdx.x];
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{
map(pos.y, pos.x) = 2;
ind = Emulation::smem::atomicAdd(&s_counter, 1);
s_st[ind] = pos;
}
}
__syncthreads();
while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
{
const int subTaskIdx = threadIdx.x >> 3;
const int portion = ::min(s_counter, blockDim.x >> 3);
if (subTaskIdx < portion)
pos = s_st[s_counter - 1 - subTaskIdx];
__syncthreads();
if (threadIdx.x == 0)
s_counter -= portion;
__syncthreads();
if (subTaskIdx < portion)
{
pos.x += c_dx[threadIdx.x & 7];
pos.y += c_dy[threadIdx.x & 7];
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{
map(pos.y, pos.x) = 2;
ind = Emulation::smem::atomicAdd(&s_counter, 1);
s_st[ind] = pos;
}
}
__syncthreads();
}
if (s_counter > 0)
{
if (threadIdx.x == 0)
{
ind = ::atomicAdd(&counter, s_counter);
s_ind = ind - s_counter;
}
__syncthreads();
ind = s_ind;
for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
st2[ind + i] = s_st[i];
}
}
void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
while (count > 0)
{
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
const dim3 block(128);
const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
std::swap(st1, st2);
}
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
struct GetEdges : unary_function<int, uchar>
{
__device__ __forceinline__ uchar operator ()(int e) const
{
return (uchar)(-(e >> 1));
}
__device__ __forceinline__ GetEdges() {}
__device__ __forceinline__ GetEdges(const GetEdges&) {}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
{
enum { smart_shift = 4 };
};
}}}
namespace canny
{
void getEdges(PtrStepSzi map, PtrStepSzb dst)
{
transform(map, dst, GetEdges(), WithOutMask(), 0);
}
}
#endif /* CUDA_DISABLER */
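
Taken together, these kernels implement the usual GPU Canny pipeline: calcMagnitude evaluates the 3x3 Sobel derivatives and the gradient norm, calcMap performs non-maximum suppression along the gradient direction (sectors split at tan 22.5 and tan 67.5 degrees in fixed point) and thresholds the result (0 = not an edge, 1 = weak, 2 = strong), edgesHysteresisLocal/Global grow strong edges into neighbouring weak pixels through the st1/st2 work queues, and getEdges writes 255 for map value 2 and 0 otherwise. The magnitude step computes

d_x = (I_{y-1,x+1} + 2 I_{y,x+1} + I_{y+1,x+1}) - (I_{y-1,x-1} + 2 I_{y,x-1} + I_{y+1,x-1})
d_y = (I_{y+1,x-1} + 2 I_{y+1,x} + I_{y+1,x+1}) - (I_{y-1,x-1} + 2 I_{y-1,x} + I_{y-1,x+1})
\lVert\nabla I\rVert = |d_x| + |d_y| \;(\text{L1}) \quad\text{or}\quad \sqrt{d_x^2 + d_y^2} \;(\text{L2})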


@@ -1,534 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <opencv2/core/cuda/common.hpp>
#include <opencv2/core/cuda/vec_traits.hpp>
#include <opencv2/core/cuda/vec_math.hpp>
#include <opencv2/core/cuda/emulation.hpp>
#include <iostream>
#include <stdio.h>
namespace cv { namespace gpu { namespace cudev
{
namespace ccl
{
enum
{
WARP_SIZE = 32,
WARP_LOG = 5,
CTA_SIZE_X = 32,
CTA_SIZE_Y = 8,
STA_SIZE_MERGE_Y = 4,
STA_SIZE_MERGE_X = 32,
TPB_X = 1,
TPB_Y = 4,
TILE_COLS = CTA_SIZE_X * TPB_X,
TILE_ROWS = CTA_SIZE_Y * TPB_Y
};
template<typename T> struct IntervalsTraits
{
typedef T elem_type;
};
template<> struct IntervalsTraits<unsigned char>
{
typedef int dist_type;
enum {ch = 1};
};
template<> struct IntervalsTraits<uchar3>
{
typedef int3 dist_type;
enum {ch = 3};
};
template<> struct IntervalsTraits<uchar4>
{
typedef int4 dist_type;
enum {ch = 4};
};
template<> struct IntervalsTraits<unsigned short>
{
typedef int dist_type;
enum {ch = 1};
};
template<> struct IntervalsTraits<ushort3>
{
typedef int3 dist_type;
enum {ch = 3};
};
template<> struct IntervalsTraits<ushort4>
{
typedef int4 dist_type;
enum {ch = 4};
};
template<> struct IntervalsTraits<float>
{
typedef float dist_type;
enum {ch = 1};
};
template<> struct IntervalsTraits<int>
{
typedef int dist_type;
enum {ch = 1};
};
typedef unsigned char component;
enum Edges { UP = 1, DOWN = 2, LEFT = 4, RIGHT = 8, EMPTY = 0xF0 };
template<typename T, int CH> struct InInterval {};
template<typename T> struct InInterval<T, 1>
{
typedef typename VecTraits<T>::elem_type E;
__host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi) : lo((E)(-_lo.x)), hi((E)_hi.x) {};
T lo, hi;
template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
{
I d = a - b;
return lo <= d && d <= hi;
}
};
template<typename T> struct InInterval<T, 3>
{
typedef typename VecTraits<T>::elem_type E;
__host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
: lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z)){};
T lo, hi;
template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
{
I d = a - b;
return lo.x <= d.x && d.x <= hi.x &&
lo.y <= d.y && d.y <= hi.y &&
lo.z <= d.z && d.z <= hi.z;
}
};
template<typename T> struct InInterval<T, 4>
{
typedef typename VecTraits<T>::elem_type E;
__host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
: lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z), (E)(-_lo.w))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z, (E)_hi.w)){};
T lo, hi;
template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
{
I d = a - b;
return lo.x <= d.x && d.x <= hi.x &&
lo.y <= d.y && d.y <= hi.y &&
lo.z <= d.z && d.z <= hi.z &&
lo.w <= d.w && d.w <= hi.w;
}
};
template<typename T, typename F>
__global__ void computeConnectivity(const PtrStepSz<T> image, PtrStepSzb components, F connected)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x >= image.cols || y >= image.rows) return;
T intensity = image(y, x);
component c = 0;
if ( x > 0 && connected(intensity, image(y, x - 1)))
c |= LEFT;
if ( y > 0 && connected(intensity, image(y - 1, x)))
c |= UP;
if ( x + 1 < image.cols && connected(intensity, image(y, x + 1)))
c |= RIGHT;
if ( y + 1 < image.rows && connected(intensity, image(y + 1, x)))
c |= DOWN;
components(y, x) = c;
}
template< typename T>
void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream)
{
dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
dim3 grid(divUp(image.cols, block.x), divUp(image.rows, block.y));
typedef InInterval<typename IntervalsTraits<T>::dist_type, IntervalsTraits<T>::ch> Int_t;
Int_t inInt(lo, hi);
computeConnectivity<T, Int_t><<<grid, block, 0, stream>>>(static_cast<const PtrStepSz<T> >(image), edges, inInt);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void computeEdges<uchar> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<uchar3> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<uchar4> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<ushort> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<ushort3>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<ushort4>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<int> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<float> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
__global__ void lableTiles(const PtrStepSzb edges, PtrStepSzi comps)
{
int x = threadIdx.x + blockIdx.x * TILE_COLS;
int y = threadIdx.y + blockIdx.y * TILE_ROWS;
if (x >= edges.cols || y >= edges.rows) return;
// TPB_X is currently 1, so the inner loop over j runs a single iteration per thread
int bounds = ((y + TPB_Y) < edges.rows);
__shared__ int labelsTile[TILE_ROWS][TILE_COLS];
__shared__ int edgesTile[TILE_ROWS][TILE_COLS];
int new_labels[TPB_Y][TPB_X];
int old_labels[TPB_Y][TPB_X];
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
int yloc = threadIdx.y + CTA_SIZE_Y * i;
int xloc = threadIdx.x + CTA_SIZE_X * j;
component c = edges(bounds * (y + CTA_SIZE_Y * i), x + CTA_SIZE_X * j);
if (!xloc) c &= ~LEFT;
if (!yloc) c &= ~UP;
if (xloc == TILE_COLS -1) c &= ~RIGHT;
if (yloc == TILE_ROWS -1) c &= ~DOWN;
new_labels[i][j] = yloc * TILE_COLS + xloc;
edgesTile[yloc][xloc] = c;
}
for (int k = 0; ;++k)
{
//1. backup
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
int yloc = threadIdx.y + CTA_SIZE_Y * i;
int xloc = threadIdx.x + CTA_SIZE_X * j;
old_labels[i][j] = new_labels[i][j];
labelsTile[yloc][xloc] = new_labels[i][j];
}
__syncthreads();
//2. compare local arrays
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
int yloc = threadIdx.y + CTA_SIZE_Y * i;
int xloc = threadIdx.x + CTA_SIZE_X * j;
component c = edgesTile[yloc][xloc];
int label = new_labels[i][j];
if (c & UP)
label = ::min(label, labelsTile[yloc - 1][xloc]);
if (c & DOWN)
label = ::min(label, labelsTile[yloc + 1][xloc]);
if (c & LEFT)
label = ::min(label, labelsTile[yloc][xloc - 1]);
if (c & RIGHT)
label = ::min(label, labelsTile[yloc][xloc + 1]);
new_labels[i][j] = label;
}
__syncthreads();
//3. determine: Is any value changed?
int changed = 0;
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
if (new_labels[i][j] < old_labels[i][j])
{
changed = 1;
Emulation::smem::atomicMin(&labelsTile[0][0] + old_labels[i][j], new_labels[i][j]);
}
}
changed = Emulation::syncthreadsOr(changed);
if (!changed)
break;
//4. Compact paths
const int *labels = &labelsTile[0][0];
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
int label = new_labels[i][j];
while( labels[label] < label ) label = labels[label];
new_labels[i][j] = label;
}
__syncthreads();
}
#pragma unroll
for (int i = 0; i < TPB_Y; ++i)
#pragma unroll
for (int j = 0; j < TPB_X; ++j)
{
int label = new_labels[i][j];
int yloc = label / TILE_COLS;
int xloc = label - yloc * TILE_COLS;
xloc += blockIdx.x * TILE_COLS;
yloc += blockIdx.y * TILE_ROWS;
label = yloc * edges.cols + xloc;
// do it for x too.
if (y + CTA_SIZE_Y * i < comps.rows) comps(y + CTA_SIZE_Y * i, x + CTA_SIZE_X * j) = label;
}
}
__device__ __forceinline__ int root(const PtrStepSzi& comps, int label)
{
while(1)
{
int y = label / comps.cols;
int x = label - y * comps.cols;
int parent = comps(y, x);
if (label == parent) break;
label = parent;
}
return label;
}
__device__ __forceinline__ void isConnected(PtrStepSzi& comps, int l1, int l2, bool& changed)
{
int r1 = root(comps, l1);
int r2 = root(comps, l2);
if (r1 == r2) return;
int mi = ::min(r1, r2);
int ma = ::max(r1, r2);
int y = ma / comps.cols;
int x = ma - y * comps.cols;
atomicMin(&comps.ptr(y)[x], mi);
changed = true;
}
__global__ void crossMerge(const int tilesNumY, const int tilesNumX, int tileSizeY, int tileSizeX,
const PtrStepSzb edges, PtrStepSzi comps, const int yIncomplete, int xIncomplete)
{
int tid = threadIdx.y * blockDim.x + threadIdx.x;
int stride = blockDim.y * blockDim.x;
int ybegin = blockIdx.y * (tilesNumY * tileSizeY);
int yend = ybegin + tilesNumY * tileSizeY;
if (blockIdx.y == gridDim.y - 1)
{
yend -= yIncomplete * tileSizeY;
yend -= tileSizeY;
tileSizeY = (edges.rows % tileSizeY);
yend += tileSizeY;
}
int xbegin = blockIdx.x * tilesNumX * tileSizeX;
int xend = xbegin + tilesNumX * tileSizeX;
if (blockIdx.x == gridDim.x - 1)
{
if (xIncomplete) yend = ybegin;
xend -= xIncomplete * tileSizeX;
xend -= tileSizeX;
tileSizeX = (edges.cols % tileSizeX);
xend += tileSizeX;
}
if (blockIdx.y == (gridDim.y - 1) && yIncomplete)
{
xend = xbegin;
}
int tasksV = (tilesNumX - 1) * (yend - ybegin);
int tasksH = (tilesNumY - 1) * (xend - xbegin);
int total = tasksH + tasksV;
bool changed;
do
{
changed = false;
for (int taskIdx = tid; taskIdx < total; taskIdx += stride)
{
if (taskIdx < tasksH)
{
int indexH = taskIdx;
int row = indexH / (xend - xbegin);
int col = indexH - row * (xend - xbegin);
int y = ybegin + (row + 1) * tileSizeY;
int x = xbegin + col;
component e = edges( x, y);
if (e & UP)
{
int lc = comps(y,x);
int lu = comps(y - 1, x);
isConnected(comps, lc, lu, changed);
}
}
else
{
int indexV = taskIdx - tasksH;
int col = indexV / (yend - ybegin);
int row = indexV - col * (yend - ybegin);
int x = xbegin + (col + 1) * tileSizeX;
int y = ybegin + row;
component e = edges(x, y);
if (e & LEFT)
{
int lc = comps(y, x);
int ll = comps(y, x - 1);
isConnected(comps, lc, ll, changed);
}
}
}
} while (Emulation::syncthreadsOr(changed));
}
__global__ void flatten(const PtrStepSzb edges, PtrStepSzi comps)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if( x < comps.cols && y < comps.rows)
comps(y, x) = root(comps, comps(y, x));
}
enum {CC_NO_COMPACT = 0, CC_COMPACT_LABELS = 1};
void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
{
(void) flags;
dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
lableTiles<<<grid, block, 0, stream>>>(edges, comps);
cudaSafeCall( cudaGetLastError() );
int tileSizeX = TILE_COLS, tileSizeY = TILE_ROWS;
while (grid.x > 1 || grid.y > 1)
{
dim3 mergeGrid((int)ceilf(grid.x / 2.f), (int)ceilf(grid.y / 2.f));
dim3 mergeBlock(STA_SIZE_MERGE_X, STA_SIZE_MERGE_Y);
// debug log
// std::cout << "merging: " << grid.y << " x " << grid.x << " ---> " << mergeGrid.y << " x " << mergeGrid.x << " for tiles: " << tileSizeY << " x " << tileSizeX << std::endl;
crossMerge<<<mergeGrid, mergeBlock, 0, stream>>>(2, 2, tileSizeY, tileSizeX, edges, comps, (int)ceilf(grid.y / 2.f) - grid.y / 2, (int)ceilf(grid.x / 2.f) - grid.x / 2);
tileSizeX <<= 1;
tileSizeY <<= 1;
grid = mergeGrid;
cudaSafeCall( cudaGetLastError() );
}
grid.x = divUp(edges.cols, block.x);
grid.y = divUp(edges.rows, block.y);
flatten<<<grid, block, 0, stream>>>(edges, comps);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
} } }
#endif /* CUDA_DISABLER */
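
In summary, computeConnectivity marks which of a pixel's four neighbours fall inside the user-supplied intensity interval,

p \sim q \iff -\mathrm{lo}_c \le I_c(p) - I_c(q) \le \mathrm{hi}_c \quad \text{for every channel } c,

lableTiles then propagates minimum labels inside each 32x32 tile until convergence, crossMerge stitches tile borders with a union-find (root() follows parent links stored as flattened y*cols + x indices, and isConnected lowers the larger of two roots onto the smaller one with atomicMin), and flatten() finally replaces every label by its root.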


@@ -1,186 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/scan.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace clahe
{
__global__ void calcLutKernel(const PtrStepb src, PtrStepb lut,
const int2 tileSize, const int tilesX,
const int clipLimit, const float lutScale)
{
__shared__ int smem[512];
const int tx = blockIdx.x;
const int ty = blockIdx.y;
const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
smem[tid] = 0;
__syncthreads();
for (int i = threadIdx.y; i < tileSize.y; i += blockDim.y)
{
const uchar* srcPtr = src.ptr(ty * tileSize.y + i) + tx * tileSize.x;
for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
{
const int data = srcPtr[j];
Emulation::smem::atomicAdd(&smem[data], 1);
}
}
__syncthreads();
int tHistVal = smem[tid];
__syncthreads();
if (clipLimit > 0)
{
// clip histogram bar
int clipped = 0;
if (tHistVal > clipLimit)
{
clipped = tHistVal - clipLimit;
tHistVal = clipLimit;
}
// find number of overall clipped samples
reduce<256>(smem, clipped, tid, plus<int>());
// broadcast evaluated value
__shared__ int totalClipped;
if (tid == 0)
totalClipped = clipped;
__syncthreads();
// redistribute clipped samples evenly
int redistBatch = totalClipped / 256;
tHistVal += redistBatch;
int residual = totalClipped - redistBatch * 256;
if (tid < residual)
++tHistVal;
}
const int lutVal = blockScanInclusive<256>(tHistVal, smem, tid);
lut(ty * tilesX + tx, tid) = saturate_cast<uchar>(__float2int_rn(lutScale * lutVal));
}
void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(tilesX, tilesY);
calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void tranformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= src.cols || y >= src.rows)
return;
const float tyf = (static_cast<float>(y) / tileSize.y) - 0.5f;
int ty1 = __float2int_rd(tyf);
int ty2 = ty1 + 1;
const float ya = tyf - ty1;
ty1 = ::max(ty1, 0);
ty2 = ::min(ty2, tilesY - 1);
const float txf = (static_cast<float>(x) / tileSize.x) - 0.5f;
int tx1 = __float2int_rd(txf);
int tx2 = tx1 + 1;
const float xa = txf - tx1;
tx1 = ::max(tx1, 0);
tx2 = ::min(tx2, tilesX - 1);
const int srcVal = src(y, x);
float res = 0;
res += lut(ty1 * tilesX + tx1, srcVal) * ((1.0f - xa) * (1.0f - ya));
res += lut(ty1 * tilesX + tx2, srcVal) * ((xa) * (1.0f - ya));
res += lut(ty2 * tilesX + tx1, srcVal) * ((1.0f - xa) * (ya));
res += lut(ty2 * tilesX + tx2, srcVal) * ((xa) * (ya));
dst(y, x) = saturate_cast<uchar>(res);
}
void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(tranformKernel, cudaFuncCachePreferL1) );
tranformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#endif // CUDA_DISABLER
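
calcLutKernel builds each tile's 256-bin histogram with shared-memory atomics, clips every bin at clipLimit, redistributes the clipped total T evenly (floor(T/256) per bin, plus one for the first T mod 256 bins) and stores the scaled inclusive prefix sum as the tile LUT:

\mathrm{lut}_t[v] = \operatorname{sat}_{\mathrm{uchar}}\!\left( \mathrm{lutScale} \cdot \sum_{u \le v} \tilde h_t[u] \right)

tranformKernel then looks up the source intensity in the LUTs of the four surrounding tiles and blends them bilinearly with the weights (1-x_a)(1-y_a), x_a(1-y_a), (1-x_a)y_a and x_a*y_a.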


@@ -1,461 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/color.hpp"
#include "cvt_color_internal.h"
namespace cv { namespace gpu { namespace cudev
{
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_x = 8 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) \
{ \
traits::functor_type functor = traits::create_functor(); \
typedef typename traits::functor_type::argument_type src_t; \
typedef typename traits::functor_type::result_type dst_t; \
cv::gpu::cudev::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \
}
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)
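Each OPENCV_GPU_IMPLEMENT_CVTCOLOR_* invocation below stamps out one host-callable conversion function. For example, OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555) expands mechanically to:

void bgr_to_bgr555(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
    bgr_to_bgr555_traits::functor_type functor = bgr_to_bgr555_traits::create_functor();
    typedef typename bgr_to_bgr555_traits::functor_type::argument_type src_t;
    typedef typename bgr_to_bgr555_traits::functor_type::result_type dst_t;
    cv::gpu::cudev::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream);
}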
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgra)
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
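For orientation, every entry point generated above is a plain function in cv::gpu::cudev with the uniform signature void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream), so the host wrapper can dispatch on image depth through a small function table. The sketch below illustrates such a dispatch; the table layout, the bgrToRgb helper name and the implicit GpuMat-to-PtrStepSzb conversions are assumptions for illustration, not code from this commit.

    typedef void (*cvt_func_t)(cv::gpu::PtrStepSzb src, cv::gpu::PtrStepSzb dst, cudaStream_t stream);

    void bgrToRgb(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cudaStream_t stream)
    {
        // one slot per CV depth (8U, 8S, 16U, 16S, 32S, 32F); unsupported depths stay 0
        static const cvt_func_t funcs[] =
        {
            cv::gpu::cudev::bgr_to_rgb_8u, 0, cv::gpu::cudev::bgr_to_rgb_16u, 0, 0, cv::gpu::cudev::bgr_to_rgb_32f
        };

        dst.create(src.size(), src.type());
        funcs[src.depth()](src, dst, stream);
    }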

View File

@@ -1,544 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/color.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
namespace cv { namespace gpu { namespace cudev
{
template <typename T> struct Bayer2BGR;
template <> struct Bayer2BGR<uchar>
{
uchar3 res0;
uchar3 res1;
uchar3 res2;
uchar3 res3;
__device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
{
uchar4 patch[3][3];
patch[0][1] = ((const uchar4*) src.ptr(s_y - 1))[s_x];
patch[0][0] = ((const uchar4*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
patch[1][1] = ((const uchar4*) src.ptr(s_y))[s_x];
patch[1][0] = ((const uchar4*) src.ptr(s_y))[::max(s_x - 1, 0)];
patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
patch[2][1] = ((const uchar4*) src.ptr(s_y + 1))[s_x];
patch[2][0] = ((const uchar4*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
if ((s_y & 1) ^ start_with_green)
{
const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1;
const int t1 = (patch[1][0].w + patch[1][1].y + 1) >> 1;
const int t2 = (patch[0][1].x + patch[0][1].z + patch[2][1].x + patch[2][1].z + 2) >> 2;
const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][1].z + patch[2][1].y + 2) >> 2;
const int t4 = (patch[0][1].z + patch[2][1].z + 1) >> 1;
const int t5 = (patch[1][1].y + patch[1][1].w + 1) >> 1;
const int t6 = (patch[0][1].z + patch[0][2].x + patch[2][1].z + patch[2][2].x + 2) >> 2;
const int t7 = (patch[0][1].w + patch[1][1].z + patch[1][2].x + patch[2][1].w + 2) >> 2;
if ((s_y & 1) ^ blue_last)
{
res0.x = t1;
res0.y = patch[1][1].x;
res0.z = t0;
res1.x = patch[1][1].y;
res1.y = t3;
res1.z = t2;
res2.x = t5;
res2.y = patch[1][1].z;
res2.z = t4;
res3.x = patch[1][1].w;
res3.y = t7;
res3.z = t6;
}
else
{
res0.x = t0;
res0.y = patch[1][1].x;
res0.z = t1;
res1.x = t2;
res1.y = t3;
res1.z = patch[1][1].y;
res2.x = t4;
res2.y = patch[1][1].z;
res2.z = t5;
res3.x = t6;
res3.y = t7;
res3.z = patch[1][1].w;
}
}
else
{
const int t0 = (patch[0][0].w + patch[0][1].y + patch[2][0].w + patch[2][1].y + 2) >> 2;
const int t1 = (patch[0][1].x + patch[1][0].w + patch[1][1].y + patch[2][1].x + 2) >> 2;
const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
const int t3 = (patch[1][1].x + patch[1][1].z + 1) >> 1;
const int t4 = (patch[0][1].y + patch[0][1].w + patch[2][1].y + patch[2][1].w + 2) >> 2;
const int t5 = (patch[0][1].z + patch[1][1].y + patch[1][1].w + patch[2][1].z + 2) >> 2;
const int t6 = (patch[0][1].w + patch[2][1].w + 1) >> 1;
const int t7 = (patch[1][1].z + patch[1][2].x + 1) >> 1;
if ((s_y & 1) ^ blue_last)
{
res0.x = patch[1][1].x;
res0.y = t1;
res0.z = t0;
res1.x = t3;
res1.y = patch[1][1].y;
res1.z = t2;
res2.x = patch[1][1].z;
res2.y = t5;
res2.z = t4;
res3.x = t7;
res3.y = patch[1][1].w;
res3.z = t6;
}
else
{
res0.x = t0;
res0.y = t1;
res0.z = patch[1][1].x;
res1.x = t2;
res1.y = patch[1][1].y;
res1.z = t3;
res2.x = t4;
res2.y = t5;
res2.z = patch[1][1].z;
res3.x = t6;
res3.y = patch[1][1].w;
res3.z = t7;
}
}
}
};
template <typename D> __device__ __forceinline__ D toDst(const uchar3& pix);
template <> __device__ __forceinline__ uchar toDst<uchar>(const uchar3& pix)
{
typename bgr_to_gray_traits<uchar>::functor_type f = bgr_to_gray_traits<uchar>::create_functor();
return f(pix);
}
template <> __device__ __forceinline__ uchar3 toDst<uchar3>(const uchar3& pix)
{
return pix;
}
template <> __device__ __forceinline__ uchar4 toDst<uchar4>(const uchar3& pix)
{
return make_uchar4(pix.x, pix.y, pix.z, 255);
}
template <typename D>
__global__ void Bayer2BGR_8u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
if (s_y >= src.rows || (s_x << 2) >= src.cols)
return;
s_y = ::min(::max(s_y, 1), src.rows - 2);
Bayer2BGR<uchar> bayer;
bayer.apply(src, s_x, s_y, blue_last, start_with_green);
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
dst(d_y, d_x) = toDst<D>(bayer.res0);
if (d_x + 1 < src.cols)
dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
if (d_x + 2 < src.cols)
dst(d_y, d_x + 2) = toDst<D>(bayer.res2);
if (d_x + 3 < src.cols)
dst(d_y, d_x + 3) = toDst<D>(bayer.res3);
}
template <> struct Bayer2BGR<ushort>
{
ushort3 res0;
ushort3 res1;
__device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
{
ushort2 patch[3][3];
patch[0][1] = ((const ushort2*) src.ptr(s_y - 1))[s_x];
patch[0][0] = ((const ushort2*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
patch[1][1] = ((const ushort2*) src.ptr(s_y))[s_x];
patch[1][0] = ((const ushort2*) src.ptr(s_y))[::max(s_x - 1, 0)];
patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
patch[2][1] = ((const ushort2*) src.ptr(s_y + 1))[s_x];
patch[2][0] = ((const ushort2*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
if ((s_y & 1) ^ start_with_green)
{
const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1;
const int t1 = (patch[1][0].y + patch[1][1].y + 1) >> 1;
const int t2 = (patch[0][1].x + patch[0][2].x + patch[2][1].x + patch[2][2].x + 2) >> 2;
const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][2].x + patch[2][1].y + 2) >> 2;
if ((s_y & 1) ^ blue_last)
{
res0.x = t1;
res0.y = patch[1][1].x;
res0.z = t0;
res1.x = patch[1][1].y;
res1.y = t3;
res1.z = t2;
}
else
{
res0.x = t0;
res0.y = patch[1][1].x;
res0.z = t1;
res1.x = t2;
res1.y = t3;
res1.z = patch[1][1].y;
}
}
else
{
const int t0 = (patch[0][0].y + patch[0][1].y + patch[2][0].y + patch[2][1].y + 2) >> 2;
const int t1 = (patch[0][1].x + patch[1][0].y + patch[1][1].y + patch[2][1].x + 2) >> 2;
const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
const int t3 = (patch[1][1].x + patch[1][2].x + 1) >> 1;
if ((s_y & 1) ^ blue_last)
{
res0.x = patch[1][1].x;
res0.y = t1;
res0.z = t0;
res1.x = t3;
res1.y = patch[1][1].y;
res1.z = t2;
}
else
{
res0.x = t0;
res0.y = t1;
res0.z = patch[1][1].x;
res1.x = t2;
res1.y = patch[1][1].y;
res1.z = t3;
}
}
}
};
template <typename D> __device__ __forceinline__ D toDst(const ushort3& pix);
template <> __device__ __forceinline__ ushort toDst<ushort>(const ushort3& pix)
{
typename bgr_to_gray_traits<ushort>::functor_type f = bgr_to_gray_traits<ushort>::create_functor();
return f(pix);
}
template <> __device__ __forceinline__ ushort3 toDst<ushort3>(const ushort3& pix)
{
return pix;
}
template <> __device__ __forceinline__ ushort4 toDst<ushort4>(const ushort3& pix)
{
return make_ushort4(pix.x, pix.y, pix.z, numeric_limits<ushort>::max());
}
template <typename D>
__global__ void Bayer2BGR_16u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
if (s_y >= src.rows || (s_x << 1) >= src.cols)
return;
s_y = ::min(::max(s_y, 1), src.rows - 2);
Bayer2BGR<ushort> bayer;
bayer.apply(src, s_x, s_y, blue_last, start_with_green);
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
dst(d_y, d_x) = toDst<D>(bayer.res0);
if (d_x + 1 < src.cols)
dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
}
template <int cn>
void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, 4 * block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_8u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int cn>
void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<ushort, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, 2 * block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_16u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void Bayer2BGR_8u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
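// Illustrative host-side launch of the 3-channel, 8-bit path instantiated above. The real
// caller derives blue_last / start_with_green from the requested Bayer code; the flag values
// and the implicit GpuMat-to-PtrStepSzb conversions here are placeholders.
//
//     cv::gpu::GpuMat src;                               // CV_8UC1 Bayer mosaic
//     cv::gpu::GpuMat dst(src.size(), CV_8UC3);
//     Bayer2BGR_8u_gpu<3>(src, dst, /*blue_last*/ true, /*start_with_green*/ false, 0);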
//////////////////////////////////////////////////////////////
// Bayer Demosaicing (Malvar, He, and Cutler)
//
// by Morgan McGuire, Williams College
// http://graphics.cs.williams.edu/papers/BayerJGT09/#shaders
//
// ported to CUDA
texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);
template <typename DstType>
__global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed)
{
const float kAx = -1.0f / 8.0f, kAy = -1.5f / 8.0f, kAz = 0.5f / 8.0f /*kAw = -1.0f / 8.0f*/;
const float kBx = 2.0f / 8.0f, /*kBy = 0.0f / 8.0f,*/ /*kBz = 0.0f / 8.0f,*/ kBw = 4.0f / 8.0f ;
const float kCx = 4.0f / 8.0f, kCy = 6.0f / 8.0f, kCz = 5.0f / 8.0f /*kCw = 5.0f / 8.0f*/;
const float /*kDx = 0.0f / 8.0f,*/ kDy = 2.0f / 8.0f, kDz = -1.0f / 8.0f /*kDw = -1.0f / 8.0f*/;
const float kEx = -1.0f / 8.0f, kEy = -1.5f / 8.0f, /*kEz = -1.0f / 8.0f,*/ kEw = 0.5f / 8.0f ;
const float kFx = 2.0f / 8.0f, /*kFy = 0.0f / 8.0f,*/ kFz = 4.0f / 8.0f /*kFw = 0.0f / 8.0f*/;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x == 0 || x >= dst.cols - 1 || y == 0 || y >= dst.rows - 1)
return;
int2 center;
center.x = x + sourceOffset.x;
center.y = y + sourceOffset.y;
int4 xCoord;
xCoord.x = center.x - 2;
xCoord.y = center.x - 1;
xCoord.z = center.x + 1;
xCoord.w = center.x + 2;
int4 yCoord;
yCoord.x = center.y - 2;
yCoord.y = center.y - 1;
yCoord.z = center.y + 1;
yCoord.w = center.y + 2;
float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)
float4 Dvec;
Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)
float4 value;
value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0
// (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1
float4 PATTERN;
PATTERN.x = kCx * C;
PATTERN.y = kCy * C;
PATTERN.z = kCz * C;
PATTERN.w = PATTERN.z;
float D = Dvec.x + Dvec.y + Dvec.z + Dvec.w;
// There are five filter patterns (identity, cross, checker,
// theta, phi). Precompute the terms from all of them and then
// use swizzles to assign to color channels.
//
// Channel Matches
// x cross (e.g., EE G)
// y checker (e.g., EE B)
// z theta (e.g., EO R)
// w phi (e.g., EO B)
#define A value.x // A0 + A1
#define B value.y // B0 + B1
#define E value.z // E0 + E1
#define F value.w // F0 + F1
float3 temp;
// PATTERN.yzw += (kD.yz * D).xyy;
temp.x = kDy * D;
temp.y = kDz * D;
PATTERN.y += temp.x;
PATTERN.z += temp.y;
PATTERN.w += temp.y;
// PATTERN += (kA.xyz * A).xyzx;
temp.x = kAx * A;
temp.y = kAy * A;
temp.z = kAz * A;
PATTERN.x += temp.x;
PATTERN.y += temp.y;
PATTERN.z += temp.z;
PATTERN.w += temp.x;
// PATTERN += (kE.xyw * E).xyxz;
temp.x = kEx * E;
temp.y = kEy * E;
temp.z = kEw * E;
PATTERN.x += temp.x;
PATTERN.y += temp.y;
PATTERN.z += temp.x;
PATTERN.w += temp.z;
// PATTERN.xw += kB.xw * B;
PATTERN.x += kBx * B;
PATTERN.w += kBw * B;
// PATTERN.xz += kF.xz * F;
PATTERN.x += kFx * F;
PATTERN.z += kFz * F;
// Determine which of four types of pixels we are on.
int2 alternate;
alternate.x = (x + firstRed.x) % 2;
alternate.y = (y + firstRed.y) % 2;
// in BGR sequence;
uchar3 pixelColor =
(alternate.y == 0) ?
((alternate.x == 0) ?
make_uchar3(saturate_cast<uchar>(PATTERN.y), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(C)) :
make_uchar3(saturate_cast<uchar>(PATTERN.w), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.z))) :
((alternate.x == 0) ?
make_uchar3(saturate_cast<uchar>(PATTERN.z), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.w)) :
make_uchar3(saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(PATTERN.y)));
dst(y, x) = toDst<DstType>(pixelColor);
}
template <int cn>
void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream)
{
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
bindTexture(&sourceTex, src);
MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void MHCdemosaic<1>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
template void MHCdemosaic<3>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
template void MHCdemosaic<4>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
}}}
#endif /* CUDA_DISABLER */
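The Malvar-He-Cutler wrapper above differs from the simpler Bayer2BGR path only in its two extra parameters: sourceOffset shifts the texture reads when src is a view into a larger allocation, and firstRed gives the (x, y) position of the first red sample of the Bayer pattern. A minimal host-side sketch follows; the concrete offset and pattern values, and the GpuMat-to-PtrStepSzb conversions, are illustrative assumptions.

    cv::gpu::GpuMat src;                            // CV_8UC1 Bayer mosaic
    cv::gpu::GpuMat dst(src.size(), CV_8UC3);

    const int2 sourceOffset = make_int2(0, 0);      // src is not a sub-view here
    const int2 firstRed     = make_int2(0, 0);      // placeholder for a BayerRG-like layout

    cv::gpu::cudev::MHCdemosaic<3>(src, sourceOffset, dst, firstRed, 0);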

View File

@@ -1,143 +0,0 @@
#if !defined CUDA_DISABLER
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace gfft
{
texture<float, cudaTextureType2D, cudaReadModeElementType> eigTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__device__ int g_counter = 0;
template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols)
{
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j))
{
float val = tex2D(eigTex, j, i);
if (val > threshold)
{
float maxVal = val;
maxVal = ::fmax(tex2D(eigTex, j - 1, i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j , i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j - 1, i), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i), maxVal);
maxVal = ::fmax(tex2D(eigTex, j - 1, i + 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j , i + 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i + 1), maxVal);
if (val == maxVal)
{
const int ind = ::atomicAdd(&g_counter, 1);
if (ind < max_count)
corners[ind] = make_float2(j, i);
}
}
}
}
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
bindTexture(&eigTex, eig);
dim3 block(16, 16);
dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));
if (mask.data)
findCorners<<<grid, block>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
else
findCorners<<<grid, block>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
return std::min(count, max_count);
}
class EigGreater
{
public:
__device__ __forceinline__ bool operator()(float2 a, float2 b) const
{
return tex2D(eigTex, a.x, a.y) > tex2D(eigTex, b.x, b.y);
}
};
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count)
{
bindTexture(&eigTex, eig);
thrust::device_ptr<float2> ptr(corners);
thrust::sort(ptr, ptr + count, EigGreater());
}
} // namespace gfft
}}}
#endif /* CUDA_DISABLER */
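Together these two entry points form the device side of goodFeaturesToTrack: findCorners_gpu gathers local maxima of the corner-response image that exceed a threshold, and sortCorners_gpu orders them by response strength. The sketch below shows how a host caller might chain them; the buffer handling and every name other than the two entry points are illustrative.

    const int   maxCorners = 1000;     // illustrative cap on the number of corners
    const float threshold  = 10.0f;    // illustrative absolute response threshold

    cv::gpu::GpuMat eig;               // CV_32FC1 corner-response map, computed beforehand
    cv::gpu::GpuMat corners(1, maxCorners, CV_32FC2);

    int total = cv::gpu::cudev::gfft::findCorners_gpu(eig, threshold, cv::gpu::GpuMat(), corners.ptr<float2>(), maxCorners);
    if (total > 0)
        cv::gpu::cudev::gfft::sortCorners_gpu(eig, corners.ptr<float2>(), total);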

View File

@@ -1,153 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"
using namespace cv::gpu;
using namespace cv::gpu::cudev;
namespace hist
{
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
{
__shared__ int shist[256];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
unsigned int data = rowPtr[x];
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
unsigned int data = ((const uchar*)rowPtr)[x];
Emulation::smem::atomicAdd(&shist[data], 1);
}
}
}
__syncthreads();
const int histVal = shist[tid];
if (histVal > 0)
::atomicAdd(hist + tid, histVal);
}
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
/////////////////////////////////////////////////////////////////////////
namespace hist
{
__constant__ int c_lut[256];
struct EqualizeHist : unary_function<uchar, uchar>
{
float scale;
__host__ EqualizeHist(float _scale) : scale(_scale) {}
__device__ __forceinline__ uchar operator ()(uchar val) const
{
const int lut = c_lut[val];
return __float2int_rn(scale * lut);
}
};
}
namespace cv { namespace gpu { namespace cudev
{
template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist>
{
enum { smart_shift = 4 };
};
}}}
namespace hist
{
void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
{
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );
const float scale = 255.0f / (src.cols * src.rows);
cudev::transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
}
}
#endif /* CUDA_DISABLER */
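histogram256 accumulates into whatever 256-bin device buffer it is handed (the kernel only performs atomic adds), so the caller must zero the buffer first. A minimal host-side sketch; the raw cudaMalloc/cudaMemcpy handling is illustrative, and a production caller would more likely keep the buffer in a GpuMat.

    cv::gpu::GpuMat src;                               // CV_8UC1 input image

    int* d_hist = 0;
    cudaMalloc(&d_hist, 256 * sizeof(int));
    cudaMemset(d_hist, 0, 256 * sizeof(int));          // the kernel only accumulates

    hist::histogram256(src, d_hist, 0);

    int h_hist[256];
    cudaMemcpy(h_hist, d_hist, 256 * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_hist);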

File diff suppressed because it is too large

View File

@@ -1,754 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
/////////////////////////////////// MeanShiftFiltering ///////////////////////////////////////////////
texture<uchar4, 2> tex_meanshift;
__device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
size_t out_step, int cols, int rows,
int sp, int sr, int maxIter, float eps)
{
int isr2 = sr*sr;
uchar4 c = tex2D(tex_meanshift, x0, y0 );
// iterate meanshift procedure
for( int iter = 0; iter < maxIter; iter++ )
{
int count = 0;
int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;
float icount;
//mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)
int minx = x0-sp;
int miny = y0-sp;
int maxx = x0+sp;
int maxy = y0+sp;
for( int y = miny; y <= maxy; y++)
{
int rowCount = 0;
for( int x = minx; x <= maxx; x++ )
{
uchar4 t = tex2D( tex_meanshift, x, y );
int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);
if( norm2 <= isr2 )
{
s0 += t.x; s1 += t.y; s2 += t.z;
sx += x; rowCount++;
}
}
count += rowCount;
sy += y*rowCount;
}
if( count == 0 )
break;
icount = 1.f/count;
int x1 = __float2int_rz(sx*icount);
int y1 = __float2int_rz(sy*icount);
s0 = __float2int_rz(s0*icount);
s1 = __float2int_rz(s1*icount);
s2 = __float2int_rz(s2*icount);
int norm2 = (s0 - c.x) * (s0 - c.x) + (s1 - c.y) * (s1 - c.y) + (s2 - c.z) * (s2 - c.z);
bool stopFlag = (x0 == x1 && y0 == y1) || (::abs(x1-x0) + ::abs(y1-y0) + norm2 <= eps);
x0 = x1; y0 = y1;
c.x = s0; c.y = s1; c.z = s2;
if( stopFlag )
break;
}
int base = (blockIdx.y * blockDim.y + threadIdx.y) * out_step + (blockIdx.x * blockDim.x + threadIdx.x) * 4 * sizeof(uchar);
*(uchar4*)(out + base) = c;
return make_short2((short)x0, (short)y0);
}
__global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
{
int x0 = blockIdx.x * blockDim.x + threadIdx.x;
int y0 = blockIdx.y * blockDim.y + threadIdx.y;
if( x0 < cols && y0 < rows )
do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
}
__global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep,
unsigned char* outsp, size_t outspstep,
int cols, int rows,
int sp, int sr, int maxIter, float eps)
{
int x0 = blockIdx.x * blockDim.x + threadIdx.x;
int y0 = blockIdx.y * blockDim.y + threadIdx.y;
if( x0 < cols && y0 < rows )
{
int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
*(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
}
}
void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
{
dim3 grid(1, 1, 1);
dim3 threads(32, 8, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
//cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
void meanShiftProc_gpu(const PtrStepSzb& src, PtrStepSzb dstr, PtrStepSzb dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
{
dim3 grid(1, 1, 1);
dim3 threads(32, 8, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
//cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
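// Note on the two wrappers above: meanShiftFiltering_gpu writes only the filtered color
// image, while meanShiftProc_gpu additionally records, for every pixel, the (x, y) position
// at which the mean-shift iteration converged (a short2 in dstsp); that auxiliary map is
// intended for the segmentation step performed later on the host.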
/////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
template <typename T>
__device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)
{
unsigned int H = ((ndisp-d) * 240)/ndisp;
unsigned int hi = (H/60) % 6;
float f = H/60.f - H/60;
float p = V * (1 - S);
float q = V * (1 - f * S);
float t = V * (1 - (1 - f) * S);
float3 res;
if (hi == 0) //R = V, G = t, B = p
{
res.x = p;
res.y = t;
res.z = V;
}
if (hi == 1) // R = q, G = V, B = p
{
res.x = p;
res.y = V;
res.z = q;
}
if (hi == 2) // R = p, G = V, B = t
{
res.x = t;
res.y = V;
res.z = p;
}
if (hi == 3) // R = p, G = q, B = V
{
res.x = V;
res.y = q;
res.z = p;
}
if (hi == 4) // R = t, G = p, B = V
{
res.x = V;
res.y = p;
res.z = t;
}
if (hi == 5) // R = V, G = p, B = q
{
res.x = q;
res.y = p;
res.z = V;
}
const unsigned int b = (unsigned int)(::max(0.f, ::min(res.x, 1.f)) * 255.f);
const unsigned int g = (unsigned int)(::max(0.f, ::min(res.y, 1.f)) * 255.f);
const unsigned int r = (unsigned int)(::max(0.f, ::min(res.z, 1.f)) * 255.f);
const unsigned int a = 255U;
return (a << 24) + (r << 16) + (g << 8) + b;
}
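// cvtPixel maps a disparity d to the hue (ndisp - d) * 240 / ndisp degrees (red for the
// largest disparities down to blue for the smallest), runs the standard hue-sector
// HSV-to-RGB conversion with S = V = 1, and packs the result into one 32-bit value
// (BGRA byte order in memory, with full alpha).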
__global__ void drawColorDisp(uchar* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
{
const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x < width && y < height)
{
uchar4 d4 = *(uchar4*)(disp + y * disp_step + x);
uint4 res;
res.x = cvtPixel(d4.x, ndisp);
res.y = cvtPixel(d4.y, ndisp);
res.z = cvtPixel(d4.z, ndisp);
res.w = cvtPixel(d4.w, ndisp);
uint4* line = (uint4*)(out_image + y * out_step);
line[x >> 2] = res;
}
}
__global__ void drawColorDisp(short* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
{
const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x < width && y < height)
{
short2 d2 = *(short2*)(disp + y * disp_step + x);
uint2 res;
res.x = cvtPixel(d2.x, ndisp);
res.y = cvtPixel(d2.y, ndisp);
uint2* line = (uint2*)(out_image + y * out_step);
line[x >> 1] = res;
}
}
void drawColorDisp_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
{
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x << 2);
grid.y = divUp(src.rows, threads.y);
drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void drawColorDisp_gpu(const PtrStepSz<short>& src, const PtrStepSzb& dst, int ndisp, const cudaStream_t& stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x << 1);
grid.y = divUp(src.rows, threads.y);
drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
/////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
__constant__ float cq[16];
template <typename T, typename D>
__global__ void reprojectImageTo3D(const PtrStepSz<T> disp, PtrStep<D> xyz)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y >= disp.rows || x >= disp.cols)
return;
const float qx = x * cq[ 0] + y * cq[ 1] + cq[ 3];
const float qy = x * cq[ 4] + y * cq[ 5] + cq[ 7];
const float qz = x * cq[ 8] + y * cq[ 9] + cq[11];
const float qw = x * cq[12] + y * cq[13] + cq[15];
const T d = disp(y, x);
const float iW = 1.f / (qw + cq[14] * d);
D v = VecTraits<D>::all(1.0f);
v.x = (qx + cq[2] * d) * iW;
v.y = (qy + cq[6] * d) * iW;
v.z = (qz + cq[10] * d) * iW;
xyz(y, x) = v;
}
template <typename T, typename D>
void reprojectImageTo3D_gpu(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(disp.cols, block.x), divUp(disp.rows, block.y));
cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );
reprojectImageTo3D<T, D><<<grid, block, 0, stream>>>((PtrStepSz<T>)disp, (PtrStepSz<D>)xyz);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void reprojectImageTo3D_gpu<uchar, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
template void reprojectImageTo3D_gpu<uchar, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
template void reprojectImageTo3D_gpu<short, float3>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
template void reprojectImageTo3D_gpu<short, float4>(const PtrStepSzb disp, PtrStepSzb xyz, const float* q, cudaStream_t stream);
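// For reference, the kernel above evaluates the usual disparity-to-depth reprojection
//
//     [X Y Z W]^T = Q * [x y d 1]^T,    xyz(y, x) = (X / W, Y / W, Z / W),
//
// with the 4x4 matrix Q stored row-major in the constant array cq: qx..qw hold the
// disparity-independent part of each row, and cq[2], cq[6], cq[10], cq[14] contribute
// the disparity-dependent terms.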
/////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float a = 0.f;
float b = 0.f;
float c = 0.f;
const int ibegin = y - (block_size / 2);
const int jbegin = x - (block_size / 2);
const int iend = ibegin + block_size;
const int jend = jbegin + block_size;
for (int i = ibegin; i < iend; ++i)
{
for (int j = jbegin; j < jend; ++j)
{
float dx = tex2D(harrisDxTex, j, i);
float dy = tex2D(harrisDyTex, j, i);
a += dx * dx;
b += dx * dy;
c += dy * dy;
}
}
dst(y, x) = a * c - b * b - k * (a + c) * (a + c);
}
}
template <typename BR, typename BC>
__global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float a = 0.f;
float b = 0.f;
float c = 0.f;
const int ibegin = y - (block_size / 2);
const int jbegin = x - (block_size / 2);
const int iend = ibegin + block_size;
const int jend = jbegin + block_size;
for (int i = ibegin; i < iend; ++i)
{
const int y = border_col.idx_row(i);
for (int j = jbegin; j < jend; ++j)
{
const int x = border_row.idx_col(j);
float dx = tex2D(harrisDxTex, x, y);
float dy = tex2D(harrisDyTex, x, y);
a += dx * dx;
b += dx * dy;
c += dy * dy;
}
}
dst(y, x) = a * c - b * b - k * (a + c) * (a + c);
}
}
void cornerHarris_gpu(int block_size, float k, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
bindTexture(&harrisDxTex, Dx);
bindTexture(&harrisDyTex, Dy);
switch (border_type)
{
case BORDER_REFLECT101_GPU:
cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
break;
case BORDER_REFLECT_GPU:
cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
break;
case BORDER_REPLICATE_GPU:
cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
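// The value written by the kernels above is the standard Harris response
//
//     R = det(M) - k * trace(M)^2 = (a*c - b*b) - k * (a + c)^2,
//
// where M = [[a, b], [b, c]] accumulates dx*dx, dx*dy and dy*dy over the block_size
// window read back from the Sobel-derivative textures.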
/////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////
texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float a = 0.f;
float b = 0.f;
float c = 0.f;
const int ibegin = y - (block_size / 2);
const int jbegin = x - (block_size / 2);
const int iend = ibegin + block_size;
const int jend = jbegin + block_size;
for (int i = ibegin; i < iend; ++i)
{
for (int j = jbegin; j < jend; ++j)
{
float dx = tex2D(minEigenValDxTex, j, i);
float dy = tex2D(minEigenValDyTex, j, i);
a += dx * dx;
b += dx * dy;
c += dy * dy;
}
}
a *= 0.5f;
c *= 0.5f;
dst(y, x) = (a + c) - sqrtf((a - c) * (a - c) + b * b);
}
}
template <typename BR, typename BC>
__global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float a = 0.f;
float b = 0.f;
float c = 0.f;
const int ibegin = y - (block_size / 2);
const int jbegin = x - (block_size / 2);
const int iend = ibegin + block_size;
const int jend = jbegin + block_size;
for (int i = ibegin; i < iend; ++i)
{
int y = border_col.idx_row(i);
for (int j = jbegin; j < jend; ++j)
{
int x = border_row.idx_col(j);
float dx = tex2D(minEigenValDxTex, x, y);
float dy = tex2D(minEigenValDyTex, x, y);
a += dx * dx;
b += dx * dy;
c += dy * dy;
}
}
a *= 0.5f;
c *= 0.5f;
dst(y, x) = (a + c) - sqrtf((a - c) * (a - c) + b * b);
}
}
void cornerMinEigenVal_gpu(int block_size, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
bindTexture(&minEigenValDxTex, Dx);
bindTexture(&minEigenValDyTex, Dy);
switch (border_type)
{
case BORDER_REFLECT101_GPU:
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
break;
case BORDER_REFLECT_GPU:
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
break;
case BORDER_REPLICATE_GPU:
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
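// cornerMinEigenVal writes the smaller eigenvalue of the same 2x2 structure matrix:
//
//     lambda_min = (a + c) / 2 - sqrt(((a - c) / 2)^2 + b^2),
//
// which is exactly what the kernels compute after halving a and c.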
//////////////////////////////////////////////////////////////////////////
// buildWarpMaps
// TODO use intrinsics like __sinf and so on
namespace build_warp_maps
{
__constant__ float ck_rinv[9];
__constant__ float cr_kinv[9];
__constant__ float ct[3];
__constant__ float cscale;
}
class PlaneMapper
{
public:
static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
{
using namespace build_warp_maps;
float x_ = u / cscale - ct[0];
float y_ = v / cscale - ct[1];
float z;
x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * (1 - ct[2]);
y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * (1 - ct[2]);
z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * (1 - ct[2]);
x /= z;
y /= z;
}
};
class CylindricalMapper
{
public:
static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
{
using namespace build_warp_maps;
u /= cscale;
float x_ = ::sinf(u);
float y_ = v / cscale;
float z_ = ::cosf(u);
float z;
x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
if (z > 0) { x /= z; y /= z; }
else x = y = -1;
}
};
class SphericalMapper
{
public:
static __device__ __forceinline__ void mapBackward(float u, float v, float &x, float &y)
{
using namespace build_warp_maps;
v /= cscale;
u /= cscale;
float sinv = ::sinf(v);
float x_ = sinv * ::sinf(u);
float y_ = -::cosf(v);
float z_ = sinv * ::cosf(u);
float z;
x = ck_rinv[0] * x_ + ck_rinv[1] * y_ + ck_rinv[2] * z_;
y = ck_rinv[3] * x_ + ck_rinv[4] * y_ + ck_rinv[5] * z_;
z = ck_rinv[6] * x_ + ck_rinv[7] * y_ + ck_rinv[8] * z_;
if (z > 0) { x /= z; y /= z; }
else x = y = -1;
}
};
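// All three mappers share one backward-mapping pattern: take a destination pixel (u, v)
// in the warped (plane / cylinder / sphere) parameterization, undo the scale, lift it to
// a point on the corresponding surface, then project that point back into the source image
// with the precomputed ck_rinv (by its name, K times R inverse), dividing by z. The kernel
// below only fills map_x / map_y with these source coordinates; the actual warp is then
// performed with a separate remap pass.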
template <typename Mapper>
__global__ void buildWarpMapsKernel(int tl_u, int tl_v, int cols, int rows,
PtrStepf map_x, PtrStepf map_y)
{
int du = blockIdx.x * blockDim.x + threadIdx.x;
int dv = blockIdx.y * blockDim.y + threadIdx.y;
if (du < cols && dv < rows)
{
float u = tl_u + du;
float v = tl_v + dv;
float x, y;
Mapper::mapBackward(u, v, x, y);
map_x.ptr(dv)[du] = x;
map_y.ptr(dv)[du] = y;
}
}
void buildWarpPlaneMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
const float k_rinv[9], const float r_kinv[9], const float t[3],
float scale, cudaStream_t stream)
{
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ct, t, 3*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
int cols = map_x.cols;
int rows = map_x.rows;
dim3 threads(32, 8);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
buildWarpMapsKernel<PlaneMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void buildWarpCylindricalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
const float k_rinv[9], const float r_kinv[9], float scale,
cudaStream_t stream)
{
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
int cols = map_x.cols;
int rows = map_x.rows;
dim3 threads(32, 8);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
buildWarpMapsKernel<CylindricalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
void buildWarpSphericalMaps(int tl_u, int tl_v, PtrStepSzf map_x, PtrStepSzf map_y,
const float k_rinv[9], const float r_kinv[9], float scale,
cudaStream_t stream)
{
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cr_kinv, r_kinv, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::cscale, &scale, sizeof(float)));
int cols = map_x.cols;
int rows = map_x.rows;
dim3 threads(32, 8);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
buildWarpMapsKernel<SphericalMapper><<<grid,threads>>>(tl_u, tl_v, cols, rows, map_x, map_y);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev {
#endif /* CUDA_DISABLER */

View File

@@ -1,916 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace match_template
{
__device__ __forceinline__ float sum(float v) { return v; }
__device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
__device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }
__device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }
__device__ __forceinline__ float first(float v) { return v; }
__device__ __forceinline__ float first(float2 v) { return v.x; }
__device__ __forceinline__ float first(float3 v) { return v.x; }
__device__ __forceinline__ float first(float4 v) { return v.x; }
__device__ __forceinline__ float mul(float a, float b) { return a * b; }
__device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
__device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
__device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
__device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }
__device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
__device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
__device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
__device__ __forceinline__ float sub(float a, float b) { return a - b; }
__device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
__device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
__device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
__device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }
__device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
__device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
__device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
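// The sum/first/mul/sub overloads above let the template-matching kernels below be
// written once and instantiated for 1-4 channel uchar and float pixel types.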
//////////////////////////////////////////////////////////////////////
// Naive_CCORR
template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
{
typedef typename TypeVec<T, cn>::vec_type Type;
typedef typename TypeVec<float, cn>::vec_type Typef;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
Typef res = VecTraits<Typef>::all(0);
for (int i = 0; i < h; ++i)
{
const Type* image_ptr = (const Type*)image.ptr(y + i);
const Type* templ_ptr = (const Type*)templ.ptr(i);
for (int j = 0; j < w; ++j)
res = res + mul(image_ptr[x + j], templ_ptr[j]);
}
result.ptr(y)[x] = sum(res);
}
}
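// Each output element is the plain cross-correlation of the template with the image
// window anchored at (x, y): result(y, x) = sum over the w x h window of
// image(y + i, x + j) * templ(i, j), accumulated across all channels.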
template <typename T, int cn>
void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
};
callers[cn](image, templ, result, stream);
}
void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
};
callers[cn](image, templ, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Naive_SQDIFF
template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
{
typedef typename TypeVec<T, cn>::vec_type Type;
typedef typename TypeVec<float, cn>::vec_type Typef;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
Typef res = VecTraits<Typef>::all(0);
Typef delta;
for (int i = 0; i < h; ++i)
{
const Type* image_ptr = (const Type*)image.ptr(y + i);
const Type* templ_ptr = (const Type*)templ.ptr(i);
for (int j = 0; j < w; ++j)
{
delta = sub(image_ptr[x + j], templ_ptr[j]);
res = res + delta * delta;
}
}
result.ptr(y)[x] = sum(res);
}
}
template <typename T, int cn>
void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
};
callers[cn](image, templ, result, stream);
}
void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
};
callers[cn](image, templ, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_SQDIFF
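// Given a precomputed cross-correlation in `result` and the integral image of squared
// values `image_sqsum`, SQDIFF is recovered from the identity
//   sum (I - T)^2 = sum I^2 - 2 * sum (I * T) + sum T^2,
// so only the windowed sum of I^2 has to be looked up per output pixel.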
template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sqsum_ = (float)(
(image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
(image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum;
}
}
template <int cn>
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn,
cudaStream_t stream)
{
typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
};
callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_SQDIFF_NORMED
// normAcc* are accurate normalization routines that keep the GPU matchTemplate
// results consistent with the CPU implementation
__device__ float normAcc(float num, float denum)
{
if (::fabs(num) < denum)
return num / denum;
if (::fabs(num) < denum * 1.125f)
return num > 0 ? 1 : -1;
return 0;
}
__device__ float normAcc_SQDIFF(float num, float denum)
{
if (::fabs(num) < denum)
return num / denum;
if (::fabs(num) < denum * 1.125f)
return num > 0 ? 1 : -1;
return 1;
}
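// Both helpers clamp near-degenerate ratios: if |num| exceeds the denominator by less
// than 12.5% the result saturates to +/-1; anything beyond that is treated as a
// degenerate window (0 for correlation-style scores, 1 for SQDIFF).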
template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sqsum_ = (float)(
(image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
(image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,
sqrtf(image_sqsum_ * templ_sqsum));
}
}
template <int cn>
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
};
callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_CCOFF
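// CCOEFF subtracts the template mean from the correlation: the kernels below compute
// ccorr - mean(T) * sum(I over the window), with templ_sum_scale = sum(T) / (w * h)
// passed per channel.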
__global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_ = (float)(
(image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
(image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;
}
}
void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
const PtrStep<unsigned int> image_sum_r,
const PtrStep<unsigned int> image_sum_g,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g;
}
}
void matchTemplatePrepared_CCOFF_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
unsigned int templ_sum_r, unsigned int templ_sum_g,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
image_sum_r, image_sum_g, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
int w, int h,
float templ_sum_scale_r,
float templ_sum_scale_g,
float templ_sum_scale_b,
const PtrStep<unsigned int> image_sum_r,
const PtrStep<unsigned int> image_sum_g,
const PtrStep<unsigned int> image_sum_b,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sum_b_ = (float)(
(image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
(image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g
- image_sum_b_ * templ_sum_scale_b;
}
}
void matchTemplatePrepared_CCOFF_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
w, h,
(float)templ_sum_r / (w * h),
(float)templ_sum_g / (w * h),
(float)templ_sum_b / (w * h),
image_sum_r, image_sum_g, image_sum_b, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
int w, int h,
float templ_sum_scale_r,
float templ_sum_scale_g,
float templ_sum_scale_b,
float templ_sum_scale_a,
const PtrStep<unsigned int> image_sum_r,
const PtrStep<unsigned int> image_sum_g,
const PtrStep<unsigned int> image_sum_b,
const PtrStep<unsigned int> image_sum_a,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sum_b_ = (float)(
(image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
(image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
float image_sum_a_ = (float)(
(image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
(image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g
- image_sum_b_ * templ_sum_scale_b
- image_sum_a_ * templ_sum_scale_a;
}
}
void matchTemplatePrepared_CCOFF_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
const PtrStepSz<unsigned int> image_sum_a,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
unsigned int templ_sum_a,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
w, h,
(float)templ_sum_r / (w * h),
(float)templ_sum_g / (w * h),
(float)templ_sum_b / (w * h),
(float)templ_sum_a / (w * h),
image_sum_r, image_sum_g, image_sum_b, image_sum_a,
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////
// Prepared_CCOFF_NORMED
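// The normalized variant divides the CCOEFF numerator by
//   sqrt( (sum T^2 - (sum T)^2 / N) * (sum I^2 over the window - (sum I)^2 / N) ),  N = w * h,
// which is what the accurate normAcc helper then clamps to keep results in [-1, 1].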
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
int w, int h, float weight,
float templ_sum_scale, float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum,
const PtrStep<unsigned long long> image_sqsum,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float ccorr = result.ptr(y)[x];
float image_sum_ = (float)(
(image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
(image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
float image_sqsum_ = (float)(
(image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
(image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,
sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));
}
}
void matchTemplatePrepared_CCOFF_NORMED_8U(
int w, int h, const PtrStepSz<unsigned int> image_sum,
const PtrStepSz<unsigned long long> image_sqsum,
unsigned int templ_sum, unsigned long long templ_sqsum,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
float weight = 1.f / (w * h);
float templ_sum_scale = templ_sum * weight;
float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;
matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
w, h, weight, templ_sum_scale, templ_sqsum_scale,
image_sum, image_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g,
float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sqsum_r_ = (float)(
(image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
(image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sqsum_g_ = (float)(
(image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
(image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g;
float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
+ image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));
result.ptr(y)[x] = normAcc(num, denum);
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
float weight = 1.f / (w * h);
float templ_sum_scale_r = templ_sum_r * weight;
float templ_sum_scale_g = templ_sum_g * weight;
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g,
templ_sqsum_scale,
image_sum_r, image_sqsum_r,
image_sum_g, image_sqsum_g,
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sqsum_r_ = (float)(
(image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
(image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sqsum_g_ = (float)(
(image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
(image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
float image_sum_b_ = (float)(
(image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
(image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
float image_sqsum_b_ = (float)(
(image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
(image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g
- image_sum_b_ * templ_sum_scale_b;
float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
+ image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
+ image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));
result.ptr(y)[x] = normAcc(num, denum);
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
float weight = 1.f / (w * h);
float templ_sum_scale_r = templ_sum_r * weight;
float templ_sum_scale_g = templ_sum_g * weight;
float templ_sum_scale_b = templ_sum_b * weight;
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g
+ templ_sqsum_b - weight * templ_sum_b * templ_sum_b;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
templ_sqsum_scale,
image_sum_r, image_sqsum_r,
image_sum_g, image_sqsum_g,
image_sum_b, image_sqsum_b,
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
float templ_sum_scale_a, float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sqsum_r_ = (float)(
(image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
(image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sqsum_g_ = (float)(
(image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
(image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
float image_sum_b_ = (float)(
(image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
(image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
float image_sqsum_b_ = (float)(
(image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
(image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
float image_sum_a_ = (float)(
(image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
(image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
float image_sqsum_a_ = (float)(
(image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) -
(image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x]));
float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g
- image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a;
float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
+ image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
+ image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_
+ image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));
result.ptr(y)[x] = normAcc(num, denum);
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
float weight = 1.f / (w * h);
float templ_sum_scale_r = templ_sum_r * weight;
float templ_sum_scale_g = templ_sum_g * weight;
float templ_sum_scale_b = templ_sum_b * weight;
float templ_sum_scale_a = templ_sum_a * weight;
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g
+ templ_sqsum_b - weight * templ_sum_b * templ_sum_b
+ templ_sqsum_a - weight * templ_sum_a * templ_sum_a;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
templ_sqsum_scale,
image_sum_r, image_sqsum_r,
image_sum_g, image_sqsum_g,
image_sum_b, image_sqsum_b,
image_sum_a, image_sqsum_a,
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////
// normalize
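// Presumably used by the *_NORMED correlation modes: divides the raw score by
// sqrt(windowed sum of I^2 * sum of T^2) via normAcc.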
template <int cn>
__global__ void normalizeKernel_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sqsum_ = (float)(
(image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
(image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));
}
}
void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 2:
normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 3:
normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 4:
normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////
// extractFirstChannel
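// Copies channel 0 of a 1-4 channel 32F image into a single-channel float result.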
template <int cn>
__global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result)
{
typedef typename TypeVec<float, cn>::vec_type Typef;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
Typef val = ((const Typef*)image.ptr(y))[x];
result.ptr(y)[x] = first(val);
}
}
void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
break;
case 2:
extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
break;
case 3:
extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
break;
case 4:
extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
} //namespace match_template
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */

View File

@@ -1,569 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
using namespace cv::gpu;
typedef unsigned char uchar;
typedef unsigned short ushort;
//////////////////////////////////////////////////////////////////////////////////
//// Non Local Means Denoising
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
__device__ __forceinline__ float norm2(const float& v) { return v*v; }
__device__ __forceinline__ float norm2(const float2& v) { return v.x*v.x + v.y*v.y; }
__device__ __forceinline__ float norm2(const float3& v) { return v.x*v.x + v.y*v.y + v.z*v.z; }
__device__ __forceinline__ float norm2(const float4& v) { return v.x*v.x + v.y*v.y + v.z*v.z + v.w*v.w; }
template<typename T, typename B>
__global__ void nlm_kernel(const PtrStep<T> src, PtrStepSz<T> dst, const B b, int search_radius, int block_radius, float noise_mult)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
const int i = blockDim.y * blockIdx.y + threadIdx.y;
const int j = blockDim.x * blockIdx.x + threadIdx.x;
if (j >= dst.cols || i >= dst.rows)
return;
int bsize = search_radius + block_radius;
int search_window = 2 * search_radius + 1;
float minus_search_window2_inv = -1.f/(search_window * search_window);
value_type sum1 = VecTraits<value_type>::all(0);
float sum2 = 0.f;
if (j - bsize >= 0 && j + bsize < dst.cols && i - bsize >= 0 && i + bsize < dst.rows)
{
for(float y = -search_radius; y <= search_radius; ++y)
for(float x = -search_radius; x <= search_radius; ++x)
{
float dist2 = 0;
for(float ty = -block_radius; ty <= block_radius; ++ty)
for(float tx = -block_radius; tx <= block_radius; ++tx)
{
value_type bv = saturate_cast<value_type>(src(i + y + ty, j + x + tx));
value_type av = saturate_cast<value_type>(src(i + ty, j + tx));
dist2 += norm2(av - bv);
}
float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv);
/*if (i == 255 && j == 255)
printf("%f %f\n", w, dist2 * minus_h2_inv + (x * x + y * y) * minus_search_window2_inv);*/
sum1 = sum1 + w * saturate_cast<value_type>(src(i + y, j + x));
sum2 += w;
}
}
else
{
for(float y = -search_radius; y <= search_radius; ++y)
for(float x = -search_radius; x <= search_radius; ++x)
{
float dist2 = 0;
for(float ty = -block_radius; ty <= block_radius; ++ty)
for(float tx = -block_radius; tx <= block_radius; ++tx)
{
value_type bv = saturate_cast<value_type>(b.at(i + y + ty, j + x + tx, src));
value_type av = saturate_cast<value_type>(b.at(i + ty, j + tx, src));
dist2 += norm2(av - bv);
}
float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv);
sum1 = sum1 + w * saturate_cast<value_type>(b.at(i + y, j + x, src));
sum2 += w;
}
}
dst(i, j) = saturate_cast<T>(sum1 / sum2);
}
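// Brute-force NLM: for every offset (x, y) inside the search window the kernel
// accumulates the squared patch distance, converts it to a weight
//   w = exp(dist2 * noise_mult - (x^2 + y^2) / search_window^2)
// and averages the neighbourhood pixels with those weights. The first branch handles
// pixels whose whole search area lies inside the image; the second goes through the
// border interpolator b.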
template<typename T, template <typename> class B>
void nlm_caller(const PtrStepSzb src, PtrStepSzb dst, int search_radius, int block_radius, float h, cudaStream_t stream)
{
dim3 block (32, 8);
dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y));
B<T> b(src.rows, src.cols);
int block_window = 2 * block_radius + 1;
float minus_h2_inv = -1.f/(h * h * VecTraits<T>::cn);
float noise_mult = minus_h2_inv/(block_window * block_window);
cudaSafeCall( cudaFuncSetCacheConfig (nlm_kernel<T, B<T> >, cudaFuncCachePreferL1) );
nlm_kernel<<<grid, block>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, search_radius, block_radius, noise_mult);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template<typename T>
void nlm_bruteforce_gpu(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream)
{
typedef void (*func_t)(const PtrStepSzb src, PtrStepSzb dst, int search_radius, int block_radius, float h, cudaStream_t stream);
static func_t funcs[] =
{
nlm_caller<T, BrdReflect101>,
nlm_caller<T, BrdReplicate>,
nlm_caller<T, BrdConstant>,
nlm_caller<T, BrdReflect>,
nlm_caller<T, BrdWrap>,
};
funcs[borderMode](src, dst, search_radius, block_radius, h, stream);
}
template void nlm_bruteforce_gpu<uchar>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
template void nlm_bruteforce_gpu<uchar2>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
template void nlm_bruteforce_gpu<uchar3>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
}
}}}
//////////////////////////////////////////////////////////////////////////////////
//// Non Local Means Denoising (fast approximate version)
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <int cn> struct Unroll;
template <> struct Unroll<1>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&> tie(float& val1, float& val2)
{
return thrust::tie(val1, val2);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op);
}
};
template <> struct Unroll<2>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&> tie(float& val1, float2& val2)
{
return thrust::tie(val1, val2.x, val2.y);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op);
}
};
template <> struct Unroll<3>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&> tie(float& val1, float3& val2)
{
return thrust::tie(val1, val2.x, val2.y, val2.z);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op);
}
};
template <> struct Unroll<4>
{
template <int BLOCK_SIZE>
static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
{
return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE);
}
static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&, float&> tie(float& val1, float4& val2)
{
return thrust::tie(val1, val2.x, val2.y, val2.z, val2.w);
}
static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float>, plus<float> > op()
{
plus<float> op;
return thrust::make_tuple(op, op, op, op, op);
}
};
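// Unroll<cn> packs the scalar weight sum and the cn-channel colour sum into one thrust
// tuple so that a single block-wide reduce<CTA_SIZE> call can combine both at once.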
__device__ __forceinline__ int calcDist(const uchar& a, const uchar& b) { return (a-b)*(a-b); }
__device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); }
__device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); }
template <class T> struct FastNonLocalMenas
{
enum
{
CTA_SIZE = 128,
TILE_COLS = 128,
TILE_ROWS = 32,
STRIDE = CTA_SIZE
};
struct plus
{
__device__ __forceinline__ float operator()(float v1, float v2) const { return v1 + v2; }
};
int search_radius;
int block_radius;
int search_window;
int block_window;
float minus_h2_inv;
FastNonLocalMenas(int search_window_, int block_window_, float h) : search_radius(search_window_/2), block_radius(block_window_/2),
search_window(search_window_), block_window(block_window_), minus_h2_inv(-1.f/(h * h * VecTraits<T>::cn)) {}
PtrStep<T> src;
mutable PtrStepi buffer;
__device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
dist_sums[index] = 0;
for(int tx = 0; tx < block_window; ++tx)
col_sums(tx, index) = 0;
int y = index / search_window;
int x = index - y * search_window;
int ay = i;
int ax = j;
int by = i + y - search_radius;
int bx = j + x - search_radius;
#if 1
for (int tx = -block_radius; tx <= block_radius; ++tx)
{
int col_sum = 0;
for (int ty = -block_radius; ty <= block_radius; ++ty)
{
int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx));
dist_sums[index] += dist;
col_sum += dist;
}
col_sums(tx + block_radius, index) = col_sum;
}
#else
for (int ty = -block_radius; ty <= block_radius; ++ty)
for (int tx = -block_radius; tx <= block_radius; ++tx)
{
int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx));
dist_sums[index] += dist;
col_sums(tx + block_radius, index) += dist;
}
#endif
up_col_sums(j, index) = col_sums(block_window - 1, index);
}
}
__device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
int ay = i;
int ax = j + block_radius;
int by = i + y - search_radius;
int bx = j + x - search_radius + block_radius;
int col_sum = 0;
for (int ty = -block_radius; ty <= block_radius; ++ty)
col_sum += calcDist(src(ay + ty, ax), src(by + ty, bx));
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
up_col_sums(j, index) = col_sum;
}
}
__device__ __forceinline__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
{
int ay = i;
int ax = j + block_radius;
T a_up = src(ay - block_radius - 1, ax);
T a_down = src(ay + block_radius, ax);
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
int by = i + y - search_radius;
int bx = j + x - search_radius + block_radius;
T b_up = src(by - block_radius - 1, bx);
T b_down = src(by + block_radius, bx);
int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up);
dist_sums[index] += col_sum - col_sums(first, index);
col_sums(first, index) = col_sum;
up_col_sums(j, index) = col_sum;
}
}
__device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums, T& dst) const
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_type;
float weights_sum = 0;
sum_type sum = VecTraits<sum_type>::all(0);
float bw2_inv = 1.f/(block_window * block_window);
int sx = j - search_radius;
int sy = i - search_radius;
for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
{
int y = index / search_window;
int x = index - y * search_window;
float avg_dist = dist_sums[index] * bw2_inv;
float weight = __expf(avg_dist * minus_h2_inv);
weights_sum += weight;
sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x));
}
__shared__ float cta_buffer[CTA_SIZE * (VecTraits<T>::cn + 1)];
reduce<CTA_SIZE>(Unroll<VecTraits<T>::cn>::template smem_tuple<CTA_SIZE>(cta_buffer),
Unroll<VecTraits<T>::cn>::tie(weights_sum, sum),
threadIdx.x,
Unroll<VecTraits<T>::cn>::op());
if (threadIdx.x == 0)
dst = saturate_cast<T>(sum / weights_sum);
}
__device__ __forceinline__ void operator()(PtrStepSz<T>& dst) const
{
int tbx = blockIdx.x * TILE_COLS;
int tby = blockIdx.y * TILE_ROWS;
int tex = ::min(tbx + TILE_COLS, dst.cols);
int tey = ::min(tby + TILE_ROWS, dst.rows);
PtrStepi col_sums;
col_sums.data = buffer.ptr(dst.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window;
col_sums.step = buffer.step;
PtrStepi up_col_sums;
up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window;
up_col_sums.step = buffer.step;
extern __shared__ int dist_sums[]; //search_window * search_window
int first = 0;
for (int i = tby; i < tey; ++i)
for (int j = tbx; j < tex; ++j)
{
__syncthreads();
if (j == tbx)
{
initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums);
first = 0;
}
else
{
if (i == tby)
shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums);
else
shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums);
first = (first + 1) % block_window;
}
__syncthreads();
convolve_window(i, j, dist_sums, col_sums, up_col_sums, dst(i, j));
}
}
};
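// Sliding-window scheme: initSums_BruteForce fills the patch-distance sums for the first
// column of a tile, shiftRight_FirstRow / shiftRight_UpSums then update them incrementally
// (per-column sums in col_sums, previous-row column sums in up_col_sums) as the window
// moves right, and convolve_window turns the distance sums into exponential weights and
// reduces them across the CTA.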
template<typename T>
__global__ void fast_nlm_kernel(const FastNonLocalMenas<T> fnlm, PtrStepSz<T> dst) { fnlm(dst); }
void nln_fast_get_buffer_size(const PtrStepSzb& src, int search_window, int block_window, int& buffer_cols, int& buffer_rows)
{
typedef FastNonLocalMenas<uchar> FNLM;
dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS));
buffer_cols = search_window * search_window * grid.y;
buffer_rows = src.cols + block_window * grid.x;
}
template<typename T>
void nlm_fast_gpu(const PtrStepSzb& src, PtrStepSzb dst, PtrStepi buffer,
int search_window, int block_window, float h, cudaStream_t stream)
{
typedef FastNonLocalMenas<T> FNLM;
FNLM fnlm(search_window, block_window, h);
fnlm.src = (PtrStepSz<T>)src;
fnlm.buffer = buffer;
dim3 block(FNLM::CTA_SIZE, 1);
dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS));
int smem = search_window * search_window * sizeof(int);
fast_nlm_kernel<<<grid, block, smem>>>(fnlm, (PtrStepSz<T>)dst);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void nlm_fast_gpu<uchar>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
template void nlm_fast_gpu<uchar2>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
template void nlm_fast_gpu<uchar3>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
__global__ void fnlm_split_kernel(const PtrStepSz<uchar3> lab, PtrStepb l, PtrStep<uchar2> ab)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x < lab.cols && y < lab.rows)
{
uchar3 p = lab(y, x);
ab(y,x) = make_uchar2(p.y, p.z);
l(y,x) = p.x;
}
}
void fnlm_split_channels(const PtrStepSz<uchar3>& lab, PtrStepb l, PtrStep<uchar2> ab, cudaStream_t stream)
{
dim3 b(32, 8);
dim3 g(divUp(lab.cols, b.x), divUp(lab.rows, b.y));
fnlm_split_kernel<<<g, b>>>(lab, l, ab);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void fnlm_merge_kernel(const PtrStepb l, const PtrStep<uchar2> ab, PtrStepSz<uchar3> lab)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x < lab.cols && y < lab.rows)
{
uchar2 p = ab(y, x);
lab(y, x) = make_uchar3(l(y, x), p.x, p.y);
}
}
void fnlm_merge_channels(const PtrStepb& l, const PtrStep<uchar2>& ab, PtrStepSz<uchar3> lab, cudaStream_t stream)
{
dim3 b(32, 8);
dim3 g(divUp(lab.cols, b.x), divUp(lab.rows, b.y));
fnlm_merge_kernel<<<g, b>>>(l, ab, lab);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif /* CUDA_DISABLER */

View File

@@ -1,228 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename T, typename B> __global__ void pyrDown(const PtrStepSz<T> src, PtrStep<T> dst, const B b, int dst_cols)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_t;
__shared__ work_t smem[256 + 4];
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y;
const int src_y = 2 * y;
if (src_y >= 2 && src_y < src.rows - 2 && x >= 2 && x < src.cols - 2)
{
{
work_t sum;
sum = 0.0625f * src(src_y - 2, x);
sum = sum + 0.25f * src(src_y - 1, x);
sum = sum + 0.375f * src(src_y , x);
sum = sum + 0.25f * src(src_y + 1, x);
sum = sum + 0.0625f * src(src_y + 2, x);
smem[2 + threadIdx.x] = sum;
}
if (threadIdx.x < 2)
{
const int left_x = x - 2;
work_t sum;
sum = 0.0625f * src(src_y - 2, left_x);
sum = sum + 0.25f * src(src_y - 1, left_x);
sum = sum + 0.375f * src(src_y , left_x);
sum = sum + 0.25f * src(src_y + 1, left_x);
sum = sum + 0.0625f * src(src_y + 2, left_x);
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + 2;
work_t sum;
sum = 0.0625f * src(src_y - 2, right_x);
sum = sum + 0.25f * src(src_y - 1, right_x);
sum = sum + 0.375f * src(src_y , right_x);
sum = sum + 0.25f * src(src_y + 1, right_x);
sum = sum + 0.0625f * src(src_y + 2, right_x);
smem[4 + threadIdx.x] = sum;
}
}
else
{
{
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(x));
sum = sum + 0.375f * src(src_y , b.idx_col_high(x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(x));
smem[2 + threadIdx.x] = sum;
}
if (threadIdx.x < 2)
{
const int left_x = x - 2;
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col(left_x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col(left_x));
sum = sum + 0.375f * src(src_y , b.idx_col(left_x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col(left_x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col(left_x));
smem[threadIdx.x] = sum;
}
if (threadIdx.x > 253)
{
const int right_x = x + 2;
work_t sum;
sum = 0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(right_x));
sum = sum + 0.25f * src(b.idx_row_low (src_y - 1), b.idx_col_high(right_x));
sum = sum + 0.375f * src(src_y , b.idx_col_high(right_x));
sum = sum + 0.25f * src(b.idx_row_high(src_y + 1), b.idx_col_high(right_x));
sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(right_x));
smem[4 + threadIdx.x] = sum;
}
}
__syncthreads();
if (threadIdx.x < 128)
{
const int tid2 = threadIdx.x * 2;
work_t sum;
sum = 0.0625f * smem[2 + tid2 - 2];
sum = sum + 0.25f * smem[2 + tid2 - 1];
sum = sum + 0.375f * smem[2 + tid2 ];
sum = sum + 0.25f * smem[2 + tid2 + 1];
sum = sum + 0.0625f * smem[2 + tid2 + 2];
const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
if (dst_x < dst_cols)
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
}
}
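// pyrDown applies the 5-tap Gaussian kernel [1 4 6 4 1] / 16 vertically into shared
// memory (with a 2-pixel apron on each side), then horizontally over every other column,
// so each block row produces one half-resolution output row.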
template <typename T, template <typename> class B> void pyrDown_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(src.cols, block.x), dst.rows);
B<T> b(src.rows, src.cols);
pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
pyrDown_caller<T, BrdReflect101>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
}
template void pyrDown_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
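The pyrDown kernel above evaluates the separable 5-tap Gaussian (weights 1/16, 4/16, 6/16, 4/16, 1/16) column-wise into shared memory, including a two-pixel apron on each side of the 256-wide block, then filters row-wise over the shared buffer and keeps every second column. As a rough cross-check of the arithmetic only, here is a minimal single-channel CPU sketch of the same filter with reflect-101 borders; the helper names are hypothetical and not part of the module.

#include <cstdlib>   // std::abs

// Mirror an out-of-range index the way BrdReflect101 does (...cb|abcdefg|fe...).
static inline int reflect101(int p, int len)
{
    p = std::abs(p);
    if (p >= len)
        p = 2 * len - 2 - p;
    return p;
}

// Reference pyrDown for a single-channel float image: 5x5 separable Gaussian
// followed by dropping every other row and column.
static void pyrDownReference(const float* src, int rows, int cols, float* dst)
{
    const float w[5] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f };
    const int drows = (rows + 1) / 2;
    const int dcols = (cols + 1) / 2;
    for (int y = 0; y < drows; ++y)
        for (int x = 0; x < dcols; ++x)
        {
            float sum = 0.0f;
            for (int i = -2; i <= 2; ++i)
                for (int j = -2; j <= 2; ++j)
                {
                    const int sy = reflect101(2 * y + i, rows);
                    const int sx = reflect101(2 * x + j, cols);
                    sum += w[i + 2] * w[j + 2] * src[sy * cols + sx];
                }
            dst[y * dcols + x] = sum;
        }
}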

@@ -1,196 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename T> __global__ void pyrUp(const PtrStepSz<T> src, PtrStepSz<T> dst)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
__shared__ sum_t s_srcPatch[10][10];
__shared__ sum_t s_dstPatch[20][16];
if (threadIdx.x < 10 && threadIdx.y < 10)
{
int srcx = static_cast<int>((blockIdx.x * blockDim.x) / 2 + threadIdx.x) - 1;
int srcy = static_cast<int>((blockIdx.y * blockDim.y) / 2 + threadIdx.y) - 1;
srcx = ::abs(srcx);
srcx = ::min(src.cols - 1, srcx);
srcy = ::abs(srcy);
srcy = ::min(src.rows - 1, srcy);
s_srcPatch[threadIdx.y][threadIdx.x] = saturate_cast<sum_t>(src(srcy, srcx));
}
__syncthreads();
sum_t sum = VecTraits<sum_t>::all(0);
const int evenFlag = static_cast<int>((threadIdx.x & 1) == 0);
const int oddFlag = static_cast<int>((threadIdx.x & 1) != 0);
const bool eveny = ((threadIdx.y & 1) == 0);
const int tidx = threadIdx.x;
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[2 + threadIdx.y][threadIdx.x] = sum;
if (threadIdx.y < 2)
{
sum = VecTraits<sum_t>::all(0);
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[threadIdx.y][threadIdx.x] = sum;
}
if (threadIdx.y > 13)
{
sum = VecTraits<sum_t>::all(0);
if (eveny)
{
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx ) >> 1)];
sum = sum + ( oddFlag * 0.25f ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
}
s_dstPatch[4 + threadIdx.y][threadIdx.x] = sum;
}
__syncthreads();
sum = VecTraits<sum_t>::all(0);
const int tidy = threadIdx.y;
sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][threadIdx.x];
sum = sum + 0.25f * s_dstPatch[2 + tidy - 1][threadIdx.x];
sum = sum + 0.375f * s_dstPatch[2 + tidy ][threadIdx.x];
sum = sum + 0.25f * s_dstPatch[2 + tidy + 1][threadIdx.x];
sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][threadIdx.x];
if (x < dst.cols && y < dst.rows)
dst(y, x) = saturate_cast<T>(4.0f * sum);
}
template <typename T> void pyrUp_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
pyrUp<<<grid, block, 0, stream>>>(src, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
pyrUp_caller<T>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
}
template void pyrUp_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
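In the pyrUp kernel above, the evenFlag/oddFlag multipliers implement zero-insertion upsampling: conceptually a zero is inserted between every pair of source samples, the same 5-tap kernel is applied, and the result is scaled to compensate for the inserted zeros (a factor of 2 per dimension, which is where the final 4.0f comes from). A minimal 1-D sketch of that idea, with a hypothetical name and clamped borders for brevity:

// One row of zero-insertion upsampling: dst has 2*len samples. Odd kernel taps
// always land on inserted zeros and even taps on real samples - exactly what
// the evenFlag/oddFlag multipliers select in the device code. The 2.0f here is
// the per-dimension gain; the 2-D kernel applies it twice, giving the 4.0f above.
static void pyrUpRowReference(const float* src, int len, float* dst)
{
    const float w[5] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f };
    for (int x = 0; x < 2 * len; ++x)
    {
        float sum = 0.0f;
        for (int j = -2; j <= 2; ++j)
        {
            const int p = x + j;
            if (p & 1)
                continue;                                    // inserted zero contributes nothing
            int s = p / 2;
            s = s < 0 ? 0 : (s > len - 1 ? len - 1 : s);     // clamp at the borders
            sum += w[j + 2] * src[s];
        }
        dst[x] = 2.0f * sum;
    }
}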

@@ -1,274 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = mapx.ptr(y)[x];
const float ycoo = mapy.ptr(y)[x];
dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
{
static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, bool)
{
(void)srcWhole;
(void)xoff;
(void)yoff;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_remap_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff, yoff; \
tex_remap_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_remap_ ## type , x + xoff, y + yoff); \
} \
}; \
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float* borderValue, bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float*, bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
//OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
{
if (stream == 0)
RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc20);
else
RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc20);
}
};
template <typename T> void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
static const caller_t callers[3][5] =
{
{
RemapDispatcher<PointFilter, BrdReflect101, T>::call,
RemapDispatcher<PointFilter, BrdReplicate, T>::call,
RemapDispatcher<PointFilter, BrdConstant, T>::call,
RemapDispatcher<PointFilter, BrdReflect, T>::call,
RemapDispatcher<PointFilter, BrdWrap, T>::call
},
{
RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
RemapDispatcher<LinearFilter, BrdConstant, T>::call,
RemapDispatcher<LinearFilter, BrdReflect, T>::call,
RemapDispatcher<LinearFilter, BrdWrap, T>::call
},
{
RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
RemapDispatcher<CubicFilter, BrdConstant, T>::call,
RemapDispatcher<CubicFilter, BrdReflect, T>::call,
RemapDispatcher<CubicFilter, BrdWrap, T>::call
}
};
callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
}
template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
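remap_gpu above indexes callers[interpolation][borderMode], with rows {point, linear, cubic} and columns {reflect101, replicate, constant, reflect, wrap}; the host wrapper in the corresponding .cpp (not shown here) is assumed to translate the public OpenCV interpolation and border constants into these indices. A hypothetical call for a bilinear remap of an 8-bit, 3-channel image with a constant black border would therefore land in RemapDispatcher<LinearFilter, BrdConstant, uchar3>::call:

// Illustrative host-side invocation only; remapExample and its argument values
// are assumptions for this sketch, not part of the module.
void remapExample(PtrStepSzb src, PtrStepSzb srcWhole,
                  PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst,
                  cudaStream_t stream, bool cc20)
{
    const float borderValue[4] = { 0.f, 0.f, 0.f, 0.f };   // black border

    cv::gpu::cudev::imgproc::remap_gpu<uchar3>(
        src, srcWhole, /*xoff=*/0, /*yoff=*/0, xmap, ymap, dst,
        /*interpolation=*/1,   // row 1: LinearFilter
        /*borderMode=*/2,      // column 2: BrdConstant
        borderValue, stream, cc20);
}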

@@ -1,302 +0,0 @@
#if !defined CUDA_DISABLER
#include <cfloat>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp"
#include "opencv2/core/cuda/scan.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float xcoo = x * fx;
const float ycoo = y * fy;
dst(y, x) = saturate_cast<T>(src(ycoo, xcoo));
}
}
template <typename Ptr2D, typename T> __global__ void resize_area(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
dst(y, x) = saturate_cast<T>(src(y, x));
}
}
template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc, fx, fy);
resize<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <typename T> struct ResizeDispatcherStream<AreaFilter, T>
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
AreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T> struct ResizeDispatcherStream<IntegerAreaFilter, T>
{
static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
IntegerAreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst)
{
(void)srcWhole;
(void)xoff;
(void)yoff;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
resize<<<grid, block>>>(filteredSrc, fx, fy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_resize_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
const int xoff; \
const int yoff; \
__host__ tex_resize_ ## type ## _reader(int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_resize_ ## type, x + xoff, y + yoff); \
} \
}; \
template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type > \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz< type > dst) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_resize_ ## type, srcWhole); \
tex_resize_ ## type ## _reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter<tex_resize_ ## type ## _reader> filteredSrc(texSrc); \
resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
} \
else \
{ \
BrdReplicate< type > brd(src.rows, src.cols); \
BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > brdSrc(texSrc, brd); \
Filter< BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > > filteredSrc(brdSrc); \
resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
//OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
template <template <typename> class Filter, typename T> struct ResizeDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
if (stream == 0)
ResizeDispatcherNonStream<Filter, T>::call(src, srcWhole, xoff, yoff, fx, fy, dst);
else
ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
}
};
template <typename T> struct ResizeDispatcher<AreaFilter, T>
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
{
(void)srcWhole;
(void)xoff;
(void)yoff;
int iscale_x = (int)round(fx);
int iscale_y = (int)round(fy);
if (std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN)
ResizeDispatcherStream<IntegerAreaFilter, T>::call(src, fx, fy, dst, stream);
else
ResizeDispatcherStream<AreaFilter, T>::call(src, fx, fy, dst, stream);
}
};
template <typename T> void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
PtrStepSzb dst, int interpolation, cudaStream_t stream)
{
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream);
static const caller_t callers[4] =
{
ResizeDispatcher<PointFilter, T>::call,
ResizeDispatcher<LinearFilter, T>::call,
ResizeDispatcher<CubicFilter, T>::call,
ResizeDispatcher<AreaFilter, T>::call
};
// change to linear interpolation when area interpolation is requested for upscaling
if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
interpolation = 1;
callers[interpolation](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, fx, fy,
static_cast< PtrStepSz<T> >(dst), stream);
}
template void resize_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
//template void resize_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template void resize_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
template<typename T> struct scan_traits{};
template<> struct scan_traits<uchar>
{
typedef float scan_line_type;
};
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
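For the resize path, fx and fy map destination to source coordinates (dst(y, x) reads src(y * fy, x * fx)), so a ratio of 1 or less means that axis is not being shrunk. ResizeDispatcher<AreaFilter> then picks the integer or fractional area filter depending on whether the ratios are whole numbers, and resize_gpu silently downgrades area interpolation to linear when upscaling. A compact restatement of that decision logic, written as a hypothetical helper:

#include <cmath>    // std::abs, round
#include <cfloat>   // FLT_MIN

// Returns the filter that the dispatch above would effectively use.
static const char* effectiveResizeFilter(int interpolation, float fx, float fy)
{
    if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
        return "LinearFilter";          // area requested but not strictly downscaling on both axes
    if (interpolation == 0) return "PointFilter";
    if (interpolation == 1) return "LinearFilter";
    if (interpolation == 2) return "CubicFilter";

    const int ix = (int)round(fx);
    const int iy = (int)round(fy);
    const bool wholeScale = std::abs(fx - ix) < FLT_MIN && std::abs(fy - iy) < FLT_MIN;
    return wholeScale ? "IntegerAreaFilter" : "AreaFilter";
}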

@@ -1,389 +0,0 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp"
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
__constant__ float c_warpMat[3 * 3];
struct AffineTransform
{
static __device__ __forceinline__ float2 calcCoord(int x, int y)
{
const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];
return make_float2(xcoo, ycoo);
}
};
struct PerspectiveTransform
{
static __device__ __forceinline__ float2 calcCoord(int x, int y)
{
const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);
return make_float2(xcoo, ycoo);
}
};
///////////////////////////////////////////////////////////////////
// Build Maps
template <class Transform> __global__ void buildWarpMaps(PtrStepSzf xmap, PtrStepf ymap)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < xmap.cols && y < xmap.rows)
{
const float2 coord = Transform::calcCoord(x, y);
xmap(y, x) = coord.x;
ymap(y, x) = coord.y;
}
}
template <class Transform> void buildWarpMaps_caller(PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(xmap.cols, block.x), divUp(xmap.rows, block.y));
buildWarpMaps<Transform><<<grid, block, 0, stream>>>(xmap, ymap);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
buildWarpMaps_caller<AffineTransform>(xmap, ymap, stream);
}
void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
buildWarpMaps_caller<PerspectiveTransform>(xmap, ymap, stream);
}
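buildWarpAffineMaps_gpu and buildWarpPerspectiveMaps_gpu simply upload the 2x3 or 3x3 matrix into c_warpMat and evaluate it once per destination pixel, producing xmap/ymap in the dst-to-src direction that the remap kernels expect. A hypothetical host-side usage, assuming the matrix has already been inverted into dst-to-src form by the public wrapper:

// Sketch only: fills maps for srcX = 0.5*dstX + 10, srcY = 0.5*dstY + 20.
// buildAffineMapsExample and the coefficient values are illustrative assumptions.
void buildAffineMapsExample(PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
{
    float coeffs[2 * 3] = { 0.5f, 0.0f, 10.0f,    // row 0: a00 a01 b0
                            0.0f, 0.5f, 20.0f };  // row 1: a10 a11 b1

    cv::gpu::cudev::imgproc::buildWarpAffineMaps_gpu(coeffs, xmap, ymap, stream);
}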
///////////////////////////////////////////////////////////////////
// Warp
template <class Transform, class Ptr2D, typename T> __global__ void warp(const Ptr2D src, PtrStepSz<T> dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
const float2 coord = Transform::calcCoord(x, y);
dst.ptr(y)[x] = saturate_cast<T>(src(coord.y, coord.x));
}
}
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst);
cudaSafeCall( cudaGetLastError() );
}
};
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, bool)
{
(void)xoff;
(void)yoff;
(void)srcWhole;
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
warp<Transform><<<grid, block>>>(filter_src, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#define OPENCV_GPU_IMPLEMENT_WARP_TEX(type) \
texture< type , cudaTextureType2D > tex_warp_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_warp_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff, yoff; \
tex_warp_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_warp_ ## type , x + xoff, y + yoff); \
} \
}; \
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar4)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(schar)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(char2)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(char4)
OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort4)
OPENCV_GPU_IMPLEMENT_WARP_TEX(short)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(short2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(short4)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(int)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(int2)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(int4)
OPENCV_GPU_IMPLEMENT_WARP_TEX(float)
//OPENCV_GPU_IMPLEMENT_WARP_TEX(float2)
OPENCV_GPU_IMPLEMENT_WARP_TEX(float4)
#undef OPENCV_GPU_IMPLEMENT_WARP_TEX
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
{
if (stream == 0)
WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc20);
else
WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc20);
}
};
template <class Transform, typename T>
void warp_caller(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
static const func_t funcs[3][5] =
{
{
WarpDispatcher<Transform, PointFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, PointFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, PointFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, PointFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call
},
{
WarpDispatcher<Transform, LinearFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call
},
{
WarpDispatcher<Transform, CubicFilter, BrdReflect101, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdReplicate, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdConstant, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdReflect, T>::call,
WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call
}
};
funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
}
template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
}
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template <typename T> void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
{
cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
}
template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace cudev
#endif /* CUDA_DISABLER */
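For context, the host wrapper normally selects one of the instantiations above through a depth-by-channels function table. The sketch below is illustrative only: the dispatcher name is hypothetical, and the zero entries mirror the instantiations that are commented out above.

// Hypothetical host-side dispatcher (illustrative sketch, not from the original sources):
// picks a compiled warpAffine_gpu instantiation by depth and channel count.
static void dispatchWarpAffine(cv::gpu::PtrStepSzb src, cv::gpu::PtrStepSzb srcWhole, int xoff, int yoff,
                               float coeffs[2 * 3], cv::gpu::PtrStepSzb dst, int depth, int cn,
                               int interpolation, int borderMode, const float* borderValue,
                               cudaStream_t stream, bool cc20)
{
    using namespace cv::gpu::cudev::imgproc;
    typedef void (*caller_t)(cv::gpu::PtrStepSzb, cv::gpu::PtrStepSzb, int, int, float[2 * 3], cv::gpu::PtrStepSzb,
                             int, int, const float*, cudaStream_t, bool);
    // Rows follow the OpenCV depth order CV_8U..CV_32F, columns are channel counts 1..4;
    // zero entries correspond to the instantiations disabled above.
    static const caller_t callers[6][4] =
    {
        {warpAffine_gpu<uchar> , 0, warpAffine_gpu<uchar3> , warpAffine_gpu<uchar4> },
        {0                     , 0, 0                      , 0                      },
        {warpAffine_gpu<ushort>, 0, warpAffine_gpu<ushort3>, warpAffine_gpu<ushort4>},
        {warpAffine_gpu<short> , 0, warpAffine_gpu<short3> , warpAffine_gpu<short4> },
        {0                     , 0, 0                      , 0                      },
        {warpAffine_gpu<float> , 0, warpAffine_gpu<float3> , warpAffine_gpu<float4> }
    };
    const caller_t caller = callers[depth][cn - 1];
    if (caller)
        caller(src, srcWhole, xoff, yoff, coeffs, dst, interpolation, borderMode, borderValue, stream, cc20);
}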

View File

@@ -1,274 +0,0 @@
#ifndef __cvt_color_internal_h__
#define __cvt_color_internal_h__
namespace cv { namespace gpu { namespace cudev
{
#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f)
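For illustration, a single use of the _ALL macro expands into one stream-aware declaration per supported depth; for example, OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb) expands to:

void bgr_to_rgb_8u (PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
void bgr_to_rgb_16u(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
void bgr_to_rgb_32f(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);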
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lbgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lbgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgra)
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL
}}}
#endif

View File

@@ -1,198 +0,0 @@
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::bilateralFilter(const GpuMat&, GpuMat&, int, float, float, int, Stream&) { throw_no_cuda(); }
void cv::gpu::nonLocalMeans(const GpuMat&, GpuMat&, float, int, int, int, Stream&) { throw_no_cuda(); }
void cv::gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat&, GpuMat&, float, int, int, Stream&) { throw_no_cuda(); }
void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat&, GpuMat&, float, float, int, int, Stream&) { throw_no_cuda(); }
#else
//////////////////////////////////////////////////////////////////////////////////
//// Non Local Means Denoising (brute force)
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template<typename T>
void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, int borderMode, cudaStream_t stream);
template<typename T>
void nlm_bruteforce_gpu(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream);
}
}}}
void cv::gpu::bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, float sigma_color, float sigma_spatial, int borderMode, Stream& s)
{
using cv::gpu::cudev::imgproc::bilateral_filter_gpu;
typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, int borderMode, cudaStream_t s);
static const func_t funcs[6][4] =
{
{bilateral_filter_gpu<uchar> , 0 /*bilateral_filter_gpu<uchar2>*/ , bilateral_filter_gpu<uchar3> , bilateral_filter_gpu<uchar4> },
{0 /*bilateral_filter_gpu<schar>*/, 0 /*bilateral_filter_gpu<schar2>*/ , 0 /*bilateral_filter_gpu<schar3>*/, 0 /*bilateral_filter_gpu<schar4>*/},
{bilateral_filter_gpu<ushort> , 0 /*bilateral_filter_gpu<ushort2>*/, bilateral_filter_gpu<ushort3> , bilateral_filter_gpu<ushort4> },
{bilateral_filter_gpu<short> , 0 /*bilateral_filter_gpu<short2>*/ , bilateral_filter_gpu<short3> , bilateral_filter_gpu<short4> },
{0 /*bilateral_filter_gpu<int>*/ , 0 /*bilateral_filter_gpu<int2>*/ , 0 /*bilateral_filter_gpu<int3>*/ , 0 /*bilateral_filter_gpu<int4>*/ },
{bilateral_filter_gpu<float> , 0 /*bilateral_filter_gpu<float2>*/ , bilateral_filter_gpu<float3> , bilateral_filter_gpu<float4> }
};
sigma_color = (sigma_color <= 0 ) ? 1 : sigma_color;
sigma_spatial = (sigma_spatial <= 0 ) ? 1 : sigma_spatial;
int radius = (kernel_size <= 0) ? cvRound(sigma_spatial*1.5) : kernel_size/2;
kernel_size = std::max(radius, 1)*2 + 1;
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
dst.create(src.size(), src.type());
func(src, dst, kernel_size, sigma_spatial, sigma_color, gpuBorderType, StreamAccessor::getStream(s));
}
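A minimal usage sketch for the wrapper above (illustrative only; assumes the 2.4-era gpu headers and a CUDA-capable device):

// Illustrative usage of cv::gpu::bilateralFilter on an 8UC3 host image.
void bilateralFilterExample(const cv::Mat& img)
{
    cv::gpu::GpuMat d_src(img), d_dst;               // upload to the device
    cv::gpu::bilateralFilter(d_src, d_dst,
                             9,                      // kernel_size
                             50.f,                   // sigma_color
                             7.f,                    // sigma_spatial
                             cv::BORDER_DEFAULT,
                             cv::gpu::Stream::Null());
    cv::Mat result;
    d_dst.download(result);                          // copy back to the host
}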
void cv::gpu::nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, int borderMode, Stream& s)
{
using cv::gpu::cudev::imgproc::nlm_bruteforce_gpu;
typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream);
static const func_t funcs[4] = { nlm_bruteforce_gpu<uchar>, nlm_bruteforce_gpu<uchar2>, nlm_bruteforce_gpu<uchar3>, 0/*nlm_bruteforce_gpu<uchar4>,*/ };
CV_Assert(src.type() == CV_8U || src.type() == CV_8UC2 || src.type() == CV_8UC3);
const func_t func = funcs[src.channels() - 1];
CV_Assert(func != 0);
int b = borderMode;
CV_Assert(b == BORDER_REFLECT101 || b == BORDER_REPLICATE || b == BORDER_CONSTANT || b == BORDER_REFLECT || b == BORDER_WRAP);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
dst.create(src.size(), src.type());
func(src, dst, search_window/2, block_window/2, h, gpuBorderType, StreamAccessor::getStream(s));
}
//////////////////////////////////////////////////////////////////////////////////
//// Non Local Means Denoising (fast approximate)
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
void nln_fast_get_buffer_size(const PtrStepSzb& src, int search_window, int block_window, int& buffer_cols, int& buffer_rows);
template<typename T>
void nlm_fast_gpu(const PtrStepSzb& src, PtrStepSzb dst, PtrStepi buffer,
int search_window, int block_window, float h, cudaStream_t stream);
void fnlm_split_channels(const PtrStepSz<uchar3>& lab, PtrStepb l, PtrStep<uchar2> ab, cudaStream_t stream);
void fnlm_merge_channels(const PtrStepb& l, const PtrStep<uchar2>& ab, PtrStepSz<uchar3> lab, cudaStream_t stream);
}
}}}
void cv::gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, Stream& s)
{
CV_Assert(src.depth() == CV_8U && src.channels() < 4);
int border_size = search_window/2 + block_window/2;
Size esize = src.size() + Size(border_size, border_size) * 2;
cv::gpu::ensureSizeIsEnough(esize, CV_8UC3, extended_src_buffer);
GpuMat extended_src(esize, src.type(), extended_src_buffer.ptr(), extended_src_buffer.step);
cv::gpu::copyMakeBorder(src, extended_src, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), s);
GpuMat src_hdr = extended_src(Rect(Point2i(border_size, border_size), src.size()));
int bcols, brows;
cudev::imgproc::nln_fast_get_buffer_size(src_hdr, search_window, block_window, bcols, brows);
buffer.create(brows, bcols, CV_32S);
using namespace cv::gpu::cudev::imgproc;
typedef void (*nlm_fast_t)(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
static const nlm_fast_t funcs[] = { nlm_fast_gpu<uchar>, nlm_fast_gpu<uchar2>, nlm_fast_gpu<uchar3>, 0};
dst.create(src.size(), src.type());
funcs[src.channels()-1](src_hdr, dst, buffer, search_window, block_window, h, StreamAccessor::getStream(s));
}
void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window, int block_window, Stream& s)
{
CV_Assert(src.type() == CV_8UC3);
lab.create(src.size(), src.type());
cv::gpu::cvtColor(src, lab, cv::COLOR_BGR2Lab, 0, s);
l.create(src.size(), CV_8U);
ab.create(src.size(), CV_8UC2);
cudev::imgproc::fnlm_split_channels(lab, l, ab, StreamAccessor::getStream(s));
simpleMethod(l, l, h_luminance, search_window, block_window, s);
simpleMethod(ab, ab, h_color, search_window, block_window, s);
cudev::imgproc::fnlm_merge_channels(l, ab, lab, StreamAccessor::getStream(s));
cv::gpu::cvtColor(lab, dst, cv::COLOR_Lab2BGR, 0, s);
}
#endif
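A minimal usage sketch for the class above (illustrative only; the filter strengths are arbitrary, and the 21/7 window sizes are the conventional choices rather than values taken from this file):

// Illustrative usage of FastNonLocalMeansDenoising::labMethod on an 8UC3 host image.
void fastNlmExample(const cv::Mat& bgrImage)
{
    cv::gpu::FastNonLocalMeansDenoising denoiser;
    cv::gpu::GpuMat d_src(bgrImage), d_dst;
    denoiser.labMethod(d_src, d_dst,
                       3.0f,                         // h_luminance
                       10.0f,                        // h_color
                       21, 7,                        // search_window, block_window
                       cv::gpu::Stream::Null());
    cv::Mat denoised;
    d_dst.download(denoised);
}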

View File

@@ -1,169 +0,0 @@
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::GoodFeaturesToTrackDetector_GPU::operator ()(const GpuMat&, GpuMat&, const GpuMat&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace cudev
{
namespace gfft
{
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count);
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count);
}
}}}
void cv::gpu::GoodFeaturesToTrackDetector_GPU::operator ()(const GpuMat& image, GpuMat& corners, const GpuMat& mask)
{
using namespace cv::gpu::cudev::gfft;
CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0);
CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
ensureSizeIsEnough(image.size(), CV_32F, eig_);
if (useHarrisDetector)
cornerHarris(image, eig_, Dx_, Dy_, buf_, blockSize, 3, harrisK);
else
cornerMinEigenVal(image, eig_, Dx_, Dy_, buf_, blockSize, 3);
double maxVal = 0;
minMax(eig_, 0, &maxVal, GpuMat(), minMaxbuf_);
ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols);
if (total == 0)
{
corners.release();
return;
}
sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total);
if (minDistance < 1)
tmpCorners_.colRange(0, maxCorners > 0 ? std::min(maxCorners, total) : total).copyTo(corners);
else
{
std::vector<Point2f> tmp(total);
Mat tmpMat(1, total, CV_32FC2, (void*)&tmp[0]);
tmpCorners_.colRange(0, total).download(tmpMat);
std::vector<Point2f> tmp2;
tmp2.reserve(total);
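// Greedy minimum-distance suppression: candidates arrive sorted by corner response,
// so each one only needs to be tested against already accepted corners in its own
// grid cell and the eight neighbouring cells (cell size equals minDistance).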
const int cell_size = cvRound(minDistance);
const int grid_width = (image.cols + cell_size - 1) / cell_size;
const int grid_height = (image.rows + cell_size - 1) / cell_size;
std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
for (int i = 0; i < total; ++i)
{
Point2f p = tmp[i];
bool good = true;
int x_cell = static_cast<int>(p.x / cell_size);
int y_cell = static_cast<int>(p.y / cell_size);
int x1 = x_cell - 1;
int y1 = y_cell - 1;
int x2 = x_cell + 1;
int y2 = y_cell + 1;
// boundary check
x1 = std::max(0, x1);
y1 = std::max(0, y1);
x2 = std::min(grid_width - 1, x2);
y2 = std::min(grid_height - 1, y2);
for (int yy = y1; yy <= y2; yy++)
{
for (int xx = x1; xx <= x2; xx++)
{
std::vector<Point2f>& m = grid[yy * grid_width + xx];
if (!m.empty())
{
for(size_t j = 0; j < m.size(); j++)
{
float dx = p.x - m[j].x;
float dy = p.y - m[j].y;
if (dx * dx + dy * dy < minDistance * minDistance)
{
good = false;
goto break_out;
}
}
}
}
}
break_out:
if(good)
{
grid[y_cell * grid_width + x_cell].push_back(p);
tmp2.push_back(p);
if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
break;
}
}
corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
}
}
#endif /* !defined (HAVE_CUDA) */
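A minimal usage sketch for the detector above (illustrative only; assumes the usual (maxCorners, qualityLevel, minDistance) constructor of GoodFeaturesToTrackDetector_GPU):

// Illustrative usage of GoodFeaturesToTrackDetector_GPU on an 8UC1 host image.
void gpuGfttExample(const cv::Mat& grayImage)
{
    cv::gpu::GoodFeaturesToTrackDetector_GPU detector(1000, 0.01, 10.0);
    cv::gpu::GpuMat d_gray(grayImage), d_corners;
    detector(d_gray, d_corners, cv::gpu::GpuMat());  // no mask
    cv::Mat corners;
    d_corners.download(corners);                     // 1 x N row of CV_32FC2 corner positions
}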

View File

@@ -1,282 +0,0 @@
#include "precomp.hpp"
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::connectivityMask(const GpuMat&, GpuMat&, const cv::Scalar&, const cv::Scalar&, Stream&) { throw_no_cuda(); }
void cv::gpu::labelComponents(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace cudev
{
namespace ccl
{
void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream);
template<typename T>
void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
}
}}}
static float4 scalarToCudaType(const cv::Scalar& in)
{
return make_float4((float)in[0], (float)in[1], (float)in[2], (float)in[3]);
}
void cv::gpu::connectivityMask(const GpuMat& image, GpuMat& mask, const cv::Scalar& lo, const cv::Scalar& hi, Stream& s)
{
CV_Assert(!image.empty());
int ch = image.channels();
CV_Assert(ch <= 4);
int depth = image.depth();
typedef void (*func_t)(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
static const func_t supportLookup[8][4] =
{ // 1, 2, 3, 4
{ cudev::ccl::computeEdges<uchar>, 0, cudev::ccl::computeEdges<uchar3>, cudev::ccl::computeEdges<uchar4> },// CV_8U
{ 0, 0, 0, 0 },// CV_8S
{ cudev::ccl::computeEdges<ushort>, 0, cudev::ccl::computeEdges<ushort3>, cudev::ccl::computeEdges<ushort4> },// CV_16U
{ 0, 0, 0, 0 },// CV_16S
{ cudev::ccl::computeEdges<int>, 0, 0, 0 },// CV_32S
{ cudev::ccl::computeEdges<float>, 0, 0, 0 },// CV_32F
{ 0, 0, 0, 0 },// CV_64F
{ 0, 0, 0, 0 } // CV_USRTYPE1
};
func_t f = supportLookup[depth][ch - 1];
CV_Assert(f);
if (image.size() != mask.size() || mask.type() != CV_8UC1)
mask.create(image.size(), CV_8UC1);
cudaStream_t stream = StreamAccessor::getStream(s);
float4 culo = scalarToCudaType(lo), cuhi = scalarToCudaType(hi);
f(image, mask, culo, cuhi, stream);
}
void cv::gpu::labelComponents(const GpuMat& mask, GpuMat& components, int flags, Stream& s)
{
CV_Assert(!mask.empty() && mask.type() == CV_8U);
if (!deviceSupports(SHARED_ATOMICS))
CV_Error(cv::Error::StsNotImplemented, "The device doesn't support shared atomics and communicative synchronization!");
components.create(mask.size(), CV_32SC1);
cudaStream_t stream = StreamAccessor::getStream(s);
cudev::ccl::labelComponents(mask, components, flags, stream);
}
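A minimal usage sketch for the two functions above (illustrative only; the lo/hi thresholds and the flags value of 0 are assumptions, not values taken from this file):

// Illustrative usage: build a connectivity mask from an 8UC1 image, then label components.
void gpuCclExample(const cv::Mat& grayImage)
{
    cv::gpu::GpuMat d_img(grayImage), d_mask, d_labels;
    cv::gpu::connectivityMask(d_img, d_mask, cv::Scalar::all(0), cv::Scalar::all(2), cv::gpu::Stream::Null());
    cv::gpu::labelComponents(d_mask, d_labels, 0, cv::gpu::Stream::Null());
    cv::Mat labels;
    d_labels.download(labels);                       // CV_32SC1 label image
}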
namespace
{
typedef NppStatus (*init_func_t)(NppiSize oSize, NppiGraphcutState** ppState, Npp8u* pDeviceMem);
class NppiGraphcutStateHandler
{
public:
NppiGraphcutStateHandler(NppiSize sznpp, Npp8u* pDeviceMem, const init_func_t func)
{
nppSafeCall( func(sznpp, &pState, pDeviceMem) );
}
~NppiGraphcutStateHandler()
{
nppSafeCall( nppiGraphcutFree(pState) );
}
operator NppiGraphcutState*()
{
return pState;
}
private:
NppiGraphcutState* pState;
};
}
void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s)
{
#if (CUDA_VERSION < 5000)
CV_Assert(terminals.type() == CV_32S);
#else
CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F);
#endif
Size src_size = terminals.size();
CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
CV_Assert(leftTransp.type() == terminals.type());
CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
CV_Assert(rightTransp.type() == terminals.type());
CV_Assert(top.size() == src_size);
CV_Assert(top.type() == terminals.type());
CV_Assert(bottom.size() == src_size);
CV_Assert(bottom.type() == terminals.type());
labels.create(src_size, CV_8U);
NppiSize sznpp;
sznpp.width = src_size.width;
sznpp.height = src_size.height;
int bufsz;
nppSafeCall( nppiGraphcutGetSize(sznpp, &bufsz) );
ensureSizeIsEnough(1, bufsz, CV_8U, buf);
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcutInitAlloc);
#if (CUDA_VERSION < 5000)
nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
#else
if (terminals.type() == CV_32S)
{
nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
}
else
{
nppSafeCall( nppiGraphcut_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(), top.ptr<Npp32f>(), bottom.ptr<Npp32f>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
}
#endif
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight, GpuMat& labels, GpuMat& buf, Stream& s)
{
#if (CUDA_VERSION < 5000)
CV_Assert(terminals.type() == CV_32S);
#else
CV_Assert(terminals.type() == CV_32S || terminals.type() == CV_32F);
#endif
Size src_size = terminals.size();
CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
CV_Assert(leftTransp.type() == terminals.type());
CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
CV_Assert(rightTransp.type() == terminals.type());
CV_Assert(top.size() == src_size);
CV_Assert(top.type() == terminals.type());
CV_Assert(topLeft.size() == src_size);
CV_Assert(topLeft.type() == terminals.type());
CV_Assert(topRight.size() == src_size);
CV_Assert(topRight.type() == terminals.type());
CV_Assert(bottom.size() == src_size);
CV_Assert(bottom.type() == terminals.type());
CV_Assert(bottomLeft.size() == src_size);
CV_Assert(bottomLeft.type() == terminals.type());
CV_Assert(bottomRight.size() == src_size);
CV_Assert(bottomRight.type() == terminals.type());
labels.create(src_size, CV_8U);
NppiSize sznpp;
sznpp.width = src_size.width;
sznpp.height = src_size.height;
int bufsz;
nppSafeCall( nppiGraphcut8GetSize(sznpp, &bufsz) );
ensureSizeIsEnough(1, bufsz, CV_8U, buf);
cudaStream_t stream = StreamAccessor::getStream(s);
NppStreamHandler h(stream);
NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcut8InitAlloc);
#if (CUDA_VERSION < 5000)
nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
#else
if (terminals.type() == CV_32S)
{
nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
}
else
{
nppSafeCall( nppiGraphcut8_32f8u(terminals.ptr<Npp32f>(), leftTransp.ptr<Npp32f>(), rightTransp.ptr<Npp32f>(),
top.ptr<Npp32f>(), topLeft.ptr<Npp32f>(), topRight.ptr<Npp32f>(),
bottom.ptr<Npp32f>(), bottomLeft.ptr<Npp32f>(), bottomRight.ptr<Npp32f>(),
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
}
#endif
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
#endif /* !defined (HAVE_CUDA) */

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,439 +0,0 @@
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
#else
namespace cv { namespace gpu { namespace cudev
{
namespace match_template
{
void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
int cn, cudaStream_t stream);
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
int cn, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
const PtrStepSz<unsigned int> image_sum_a,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
unsigned int templ_sum_a,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8U(
int w, int h, const PtrStepSz<unsigned int> image_sum,
const PtrStepSz<unsigned long long> image_sqsum,
unsigned int templ_sum, unsigned long long templ_sqsum,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
PtrStepSzf result, cudaStream_t stream);
void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream);
void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream);
}
}}}
using namespace ::cv::gpu::cudev::match_template;
namespace
{
// Evaluates the optimal template-area threshold. If the template's area is
// less than the threshold, the naive matchTemplate implementation is used;
// otherwise the FFT-based one (if available).
int getTemplateThreshold(int method, int depth)
{
switch (method)
{
case cv::TM_CCORR:
if (depth == CV_32F) return 250;
if (depth == CV_8U) return 300;
break;
case cv::TM_SQDIFF:
if (depth == CV_8U) return 300;
break;
}
CV_Error(cv::Error::StsBadArg, "getTemplateThreshold: unsupported match template mode");
return 0;
}
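// For example, with an 8U image and TM_CCORR: a 16x16 template (area 256 < 300)
// takes the naive kernel, while a 32x32 template (area 1024 >= 300) is routed
// through the FFT-based convolve() path below.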
void matchTemplate_CCORR_32F(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
if (templ.size().area() < getTemplateThreshold(cv::TM_CCORR, CV_32F))
{
matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return;
}
ConvolveBuf convolve_buf;
convolve_buf.user_block_size = buf.user_block_size;
if (image.channels() == 1)
convolve(image.reshape(1), templ.reshape(1), result, true, convolve_buf, stream);
else
{
GpuMat result_;
convolve(image.reshape(1), templ.reshape(1), result_, true, convolve_buf, stream);
extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));
}
}
void matchTemplate_CCORR_8U(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
if (templ.size().area() < getTemplateThreshold(cv::TM_CCORR, CV_8U))
{
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return;
}
if (stream)
{
stream.enqueueConvert(image, buf.imagef, CV_32F);
stream.enqueueConvert(templ, buf.templf, CV_32F);
}
else
{
image.convertTo(buf.imagef, CV_32F);
templ.convertTo(buf.templf, CV_32F);
}
matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
}
void matchTemplate_CCORR_NORMED_8U(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
matchTemplate_CCORR_8U(image, templ, result, buf, stream);
buf.image_sqsums.resize(1);
sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
normalize_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
}
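The normalize_8U step above applies the standard TM_CCORR_NORMED denominator: each raw cross-correlation value is divided by the geometric mean of the windowed image energy and the template energy,

    R_normed(x, y) = R_ccorr(x, y) / sqrt( S_I(x, y) * S_T )

where S_I(x, y) is the sum of squared image pixels under the template window (read from the squared integral image in buf.image_sqsums[0]) and S_T is templ_sqsum.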
void matchTemplate_SQDIFF_32F(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
(void)buf;
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
}
void matchTemplate_SQDIFF_8U(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
if (templ.size().area() < getTemplateThreshold(cv::TM_SQDIFF, CV_8U))
{
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return;
}
buf.image_sqsums.resize(1);
sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
matchTemplate_CCORR_8U(image, templ, result, buf, stream);
matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
}
void matchTemplate_SQDIFF_NORMED_8U(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
buf.image_sqsums.resize(1);
sqrIntegral(image.reshape(1), buf.image_sqsums[0], stream);
unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ.reshape(1))[0];
matchTemplate_CCORR_8U(image, templ, result, buf, stream);
matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, buf.image_sqsums[0], templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
}
void matchTemplate_CCOFF_8U(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
matchTemplate_CCORR_8U(image, templ, result, buf, stream);
if (image.channels() == 1)
{
buf.image_sums.resize(1);
integral(image, buf.image_sums[0], stream);
unsigned int templ_sum = (unsigned int)sum(templ)[0];
matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, buf.image_sums[0], templ_sum, result, StreamAccessor::getStream(stream));
}
else
{
split(image, buf.images);
buf.image_sums.resize(buf.images.size());
for (int i = 0; i < image.channels(); ++i)
integral(buf.images[i], buf.image_sums[i], stream);
Scalar templ_sum = sum(templ);
switch (image.channels())
{
case 2:
matchTemplatePrepared_CCOFF_8UC2(
templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1],
(unsigned int)templ_sum[0], (unsigned int)templ_sum[1],
result, StreamAccessor::getStream(stream));
break;
case 3:
matchTemplatePrepared_CCOFF_8UC3(
templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], buf.image_sums[2],
(unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
result, StreamAccessor::getStream(stream));
break;
case 4:
matchTemplatePrepared_CCOFF_8UC4(
templ.cols, templ.rows, buf.image_sums[0], buf.image_sums[1], buf.image_sums[2], buf.image_sums[3],
(unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
(unsigned int)templ_sum[3], result, StreamAccessor::getStream(stream));
break;
default:
CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported number of channels");
}
}
}
void matchTemplate_CCOFF_NORMED_8U(
const GpuMat& image, const GpuMat& templ, GpuMat& result, MatchTemplateBuf &buf, Stream& stream)
{
if (stream)
{
stream.enqueueConvert(image, buf.imagef, CV_32F);
stream.enqueueConvert(templ, buf.templf, CV_32F);
}
else
{
image.convertTo(buf.imagef, CV_32F);
templ.convertTo(buf.templf, CV_32F);
}
matchTemplate_CCORR_32F(buf.imagef, buf.templf, result, buf, stream);
if (image.channels() == 1)
{
buf.image_sums.resize(1);
integral(image, buf.image_sums[0], stream);
buf.image_sqsums.resize(1);
sqrIntegral(image, buf.image_sqsums[0], stream);
unsigned int templ_sum = (unsigned int)sum(templ)[0];
unsigned long long templ_sqsum = (unsigned long long)sqrSum(templ)[0];
matchTemplatePrepared_CCOFF_NORMED_8U(
templ.cols, templ.rows, buf.image_sums[0], buf.image_sqsums[0],
templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream));
}
else
{
split(image, buf.images);
buf.image_sums.resize(buf.images.size());
buf.image_sqsums.resize(buf.images.size());
for (int i = 0; i < image.channels(); ++i)
{
integral(buf.images[i], buf.image_sums[i], stream);
sqrIntegral(buf.images[i], buf.image_sqsums[i], stream);
}
Scalar templ_sum = sum(templ);
Scalar templ_sqsum = sqrSum(templ);
switch (image.channels())
{
case 2:
matchTemplatePrepared_CCOFF_NORMED_8UC2(
templ.cols, templ.rows,
buf.image_sums[0], buf.image_sqsums[0],
buf.image_sums[1], buf.image_sqsums[1],
(unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
result, StreamAccessor::getStream(stream));
break;
case 3:
matchTemplatePrepared_CCOFF_NORMED_8UC3(
templ.cols, templ.rows,
buf.image_sums[0], buf.image_sqsums[0],
buf.image_sums[1], buf.image_sqsums[1],
buf.image_sums[2], buf.image_sqsums[2],
(unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
(unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
result, StreamAccessor::getStream(stream));
break;
case 4:
matchTemplatePrepared_CCOFF_NORMED_8UC4(
templ.cols, templ.rows,
buf.image_sums[0], buf.image_sqsums[0],
buf.image_sums[1], buf.image_sqsums[1],
buf.image_sums[2], buf.image_sqsums[2],
buf.image_sums[3], buf.image_sqsums[3],
(unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
(unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
(unsigned int)templ_sum[3], (unsigned long long)templ_sqsum[3],
result, StreamAccessor::getStream(stream));
break;
default:
CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported number of channels");
}
}
}
}
void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method, Stream& stream)
{
MatchTemplateBuf buf;
matchTemplate(image, templ, result, method, buf, stream);
}
void cv::gpu::matchTemplate(
const GpuMat& image, const GpuMat& templ, GpuMat& result, int method,
MatchTemplateBuf &buf, Stream& stream)
{
CV_Assert(image.type() == templ.type());
CV_Assert(image.cols >= templ.cols && image.rows >= templ.rows);
typedef void (*Caller)(const GpuMat&, const GpuMat&, GpuMat&, MatchTemplateBuf&, Stream& stream);
static const Caller callers8U[] = { ::matchTemplate_SQDIFF_8U, ::matchTemplate_SQDIFF_NORMED_8U,
::matchTemplate_CCORR_8U, ::matchTemplate_CCORR_NORMED_8U,
::matchTemplate_CCOFF_8U, ::matchTemplate_CCOFF_NORMED_8U };
static const Caller callers32F[] = { ::matchTemplate_SQDIFF_32F, 0,
::matchTemplate_CCORR_32F, 0, 0, 0 };
const Caller* callers = 0;
switch (image.depth())
{
case CV_8U: callers = callers8U; break;
case CV_32F: callers = callers32F; break;
default: CV_Error(cv::Error::StsBadArg, "matchTemplate: unsupported data type");
}
Caller caller = callers[method];
CV_Assert(caller);
caller(image, templ, result, buf, stream);
}
#endif
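
A minimal host-side sketch of how the matchTemplate entry point above can be called; the header path and the input file names (scene.png, patch.png) are assumed for illustration, using the 2.4-era cv::gpu API:

#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp> // assumed pre-split header location

int main()
{
    // Hypothetical 8-bit single-channel inputs.
    cv::Mat h_image = cv::imread("scene.png", 0);
    cv::Mat h_templ = cv::imread("patch.png", 0);

    cv::gpu::GpuMat d_image(h_image), d_templ(h_templ), d_result;

    // CV_TM_CCOEFF_NORMED (method index 5) dispatches to matchTemplate_CCOFF_NORMED_8U above.
    cv::gpu::matchTemplate(d_image, d_templ, d_result, CV_TM_CCOEFF_NORMED);

    // Best match position = location of the maximum response.
    double maxVal;
    cv::Point maxLoc;
    cv::gpu::minMaxLoc(d_result, 0, &maxVal, 0, &maxLoc);
    return 0;
}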

View File

@@ -1,387 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::gpu::meanShiftSegmentation(const GpuMat&, Mat&, int, int, int, TermCriteria) { throw_no_cuda(); }
#else
// Auxiliary stuff
namespace
{
//
// Declarations
//
class DjSets
{
public:
DjSets(int n);
int find(int elem);
int merge(int set1, int set2);
std::vector<int> parent;
std::vector<int> rank;
std::vector<int> size;
private:
DjSets(const DjSets&);
void operator =(const DjSets&);
};
template <typename T>
struct GraphEdge
{
GraphEdge() {}
GraphEdge(int to_, int next_, const T& val_) : to(to_), next(next_), val(val_) {}
int to;
int next;
T val;
};
template <typename T>
class Graph
{
public:
typedef GraphEdge<T> Edge;
Graph(int numv, int nume_max);
void addEdge(int from, int to, const T& val=T());
std::vector<int> start;
std::vector<Edge> edges;
int numv;
int nume_max;
int nume;
private:
Graph(const Graph&);
void operator =(const Graph&);
};
struct SegmLinkVal
{
SegmLinkVal() {}
SegmLinkVal(int dr_, int dsp_) : dr(dr_), dsp(dsp_) {}
bool operator <(const SegmLinkVal& other) const
{
return dr + dsp < other.dr + other.dsp;
}
int dr;
int dsp;
};
struct SegmLink
{
SegmLink() {}
SegmLink(int from_, int to_, const SegmLinkVal& val_)
: from(from_), to(to_), val(val_) {}
bool operator <(const SegmLink& other) const
{
return val < other.val;
}
int from;
int to;
SegmLinkVal val;
};
//
// Implementation
//
DjSets::DjSets(int n) : parent(n), rank(n, 0), size(n, 1)
{
for (int i = 0; i < n; ++i)
parent[i] = i;
}
inline int DjSets::find(int elem)
{
int set = elem;
while (set != parent[set])
set = parent[set];
while (elem != parent[elem])
{
int next = parent[elem];
parent[elem] = set;
elem = next;
}
return set;
}
inline int DjSets::merge(int set1, int set2)
{
if (rank[set1] < rank[set2])
{
parent[set1] = set2;
size[set2] += size[set1];
return set2;
}
if (rank[set2] < rank[set1])
{
parent[set2] = set1;
size[set1] += size[set2];
return set1;
}
parent[set1] = set2;
rank[set2]++;
size[set2] += size[set1];
return set2;
}
template <typename T>
Graph<T>::Graph(int numv_, int nume_max_) : start(numv_, -1), edges(nume_max_)
{
this->numv = numv_;
this->nume_max = nume_max_;
nume = 0;
}
template <typename T>
inline void Graph<T>::addEdge(int from, int to, const T& val)
{
edges[nume] = Edge(to, start[from], val);
start[from] = nume;
nume++;
}
inline int pix(int y, int x, int ncols)
{
return y * ncols + x;
}
inline int sqr(int x)
{
return x * x;
}
inline int dist2(const cv::Vec4b& lhs, const cv::Vec4b& rhs)
{
return sqr(lhs[0] - rhs[0]) + sqr(lhs[1] - rhs[1]) + sqr(lhs[2] - rhs[2]);
}
inline int dist2(const cv::Vec2s& lhs, const cv::Vec2s& rhs)
{
return sqr(lhs[0] - rhs[0]) + sqr(lhs[1] - rhs[1]);
}
} // anonymous namespace
void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, TermCriteria criteria)
{
CV_Assert(src.type() == CV_8UC4);
const int nrows = src.rows;
const int ncols = src.cols;
const int hr = sr;
const int hsp = sp;
// Perform mean shift procedure and obtain region and spatial maps
GpuMat d_rmap, d_spmap;
meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria);
Mat rmap(d_rmap);
Mat spmap(d_spmap);
Graph<SegmLinkVal> g(nrows * ncols, 4 * (nrows - 1) * (ncols - 1)
+ (nrows - 1) + (ncols - 1));
// Build the region adjacency graph from the image
Vec4b r1;
Vec4b r2[4];
Vec2s sp1;
Vec2s sp2[4];
int dr[4];
int dsp[4];
for (int y = 0; y < nrows - 1; ++y)
{
Vec4b* ry = rmap.ptr<Vec4b>(y);
Vec4b* ryp = rmap.ptr<Vec4b>(y + 1);
Vec2s* spy = spmap.ptr<Vec2s>(y);
Vec2s* spyp = spmap.ptr<Vec2s>(y + 1);
for (int x = 0; x < ncols - 1; ++x)
{
r1 = ry[x];
sp1 = spy[x];
r2[0] = ry[x + 1];
r2[1] = ryp[x];
r2[2] = ryp[x + 1];
r2[3] = ryp[x];
sp2[0] = spy[x + 1];
sp2[1] = spyp[x];
sp2[2] = spyp[x + 1];
sp2[3] = spyp[x];
dr[0] = dist2(r1, r2[0]);
dr[1] = dist2(r1, r2[1]);
dr[2] = dist2(r1, r2[2]);
dsp[0] = dist2(sp1, sp2[0]);
dsp[1] = dist2(sp1, sp2[1]);
dsp[2] = dist2(sp1, sp2[2]);
r1 = ry[x + 1];
sp1 = spy[x + 1];
dr[3] = dist2(r1, r2[3]);
dsp[3] = dist2(sp1, sp2[3]);
g.addEdge(pix(y, x, ncols), pix(y, x + 1, ncols), SegmLinkVal(dr[0], dsp[0]));
g.addEdge(pix(y, x, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[1], dsp[1]));
g.addEdge(pix(y, x, ncols), pix(y + 1, x + 1, ncols), SegmLinkVal(dr[2], dsp[2]));
g.addEdge(pix(y, x + 1, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[3], dsp[3]));
}
}
for (int y = 0; y < nrows - 1; ++y)
{
r1 = rmap.at<Vec4b>(y, ncols - 1);
r2[0] = rmap.at<Vec4b>(y + 1, ncols - 1);
sp1 = spmap.at<Vec2s>(y, ncols - 1);
sp2[0] = spmap.at<Vec2s>(y + 1, ncols - 1);
dr[0] = dist2(r1, r2[0]);
dsp[0] = dist2(sp1, sp2[0]);
g.addEdge(pix(y, ncols - 1, ncols), pix(y + 1, ncols - 1, ncols), SegmLinkVal(dr[0], dsp[0]));
}
for (int x = 0; x < ncols - 1; ++x)
{
r1 = rmap.at<Vec4b>(nrows - 1, x);
r2[0] = rmap.at<Vec4b>(nrows - 1, x + 1);
sp1 = spmap.at<Vec2s>(nrows - 1, x);
sp2[0] = spmap.at<Vec2s>(nrows - 1, x + 1);
dr[0] = dist2(r1, r2[0]);
dsp[0] = dist2(sp1, sp2[0]);
g.addEdge(pix(nrows - 1, x, ncols), pix(nrows - 1, x + 1, ncols), SegmLinkVal(dr[0], dsp[0]));
}
DjSets comps(g.numv);
// Find adjacent components
for (int v = 0; v < g.numv; ++v)
{
for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next)
{
int c1 = comps.find(v);
int c2 = comps.find(g.edges[e_it].to);
if (c1 != c2 && g.edges[e_it].val.dr < hr && g.edges[e_it].val.dsp < hsp)
comps.merge(c1, c2);
}
}
std::vector<SegmLink> edges;
edges.reserve(g.numv);
// Prepare edges connecting different components
for (int v = 0; v < g.numv; ++v)
{
int c1 = comps.find(v);
for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next)
{
int c2 = comps.find(g.edges[e_it].to);
if (c1 != c2)
edges.push_back(SegmLink(c1, c2, g.edges[e_it].val));
}
}
// Sort all graph edges connecting different components (in ascending order)
sort(edges.begin(), edges.end());
// Exclude small components (starting from the closest pair)
for (size_t i = 0; i < edges.size(); ++i)
{
int c1 = comps.find(edges[i].from);
int c2 = comps.find(edges[i].to);
if (c1 != c2 && (comps.size[c1] < minsize || comps.size[c2] < minsize))
comps.merge(c1, c2);
}
// Compute the sum of the colors of the pixels belonging to each segment
Mat h_src(src);
std::vector<Vec4i> sumcols(nrows * ncols, Vec4i(0, 0, 0, 0));
for (int y = 0; y < nrows; ++y)
{
Vec4b* h_srcy = h_src.ptr<Vec4b>(y);
for (int x = 0; x < ncols; ++x)
{
int parent = comps.find(pix(y, x, ncols));
Vec4b col = h_srcy[x];
Vec4i& sumcol = sumcols[parent];
sumcol[0] += col[0];
sumcol[1] += col[1];
sumcol[2] += col[2];
}
}
// Create the final image; the color of each segment is the average color of its pixels
dst.create(src.size(), src.type());
for (int y = 0; y < nrows; ++y)
{
Vec4b* dsty = dst.ptr<Vec4b>(y);
for (int x = 0; x < ncols; ++x)
{
int parent = comps.find(pix(y, x, ncols));
const Vec4i& sumcol = sumcols[parent];
Vec4b& dstcol = dsty[x];
dstcol[0] = static_cast<uchar>(sumcol[0] / comps.size[parent]);
dstcol[1] = static_cast<uchar>(sumcol[1] / comps.size[parent]);
dstcol[2] = static_cast<uchar>(sumcol[2] / comps.size[parent]);
dstcol[3] = 255;
}
}
}
#endif // #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
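
A minimal sketch of calling meanShiftSegmentation above; the header path and input.png are assumed for illustration, and the input is converted to CV_8UC4 first because the function asserts that type:

#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp> // assumed pre-split header location

int main()
{
    cv::Mat bgr = cv::imread("input.png"); // hypothetical input
    cv::Mat bgra;
    cv::cvtColor(bgr, bgra, CV_BGR2BGRA);  // meanShiftSegmentation requires CV_8UC4

    cv::gpu::GpuMat d_src(bgra);
    cv::Mat segmented;

    // sp = 20 (spatial window radius), sr = 20 (color window radius), minsize = 50 pixels.
    cv::gpu::meanShiftSegmentation(d_src, segmented, 20, 20, 50,
        cv::TermCriteria(cv::TermCriteria::MAX_ITER + cv::TermCriteria::EPS, 5, 1));
    return 0;
}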

View File

@@ -1,249 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::gpu::pyrDown(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::pyrUp(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::ImagePyramid::build(const GpuMat&, int, Stream&) { throw_no_cuda(); }
void cv::gpu::ImagePyramid::getLayer(GpuMat&, Size, Stream&) const { throw_no_cuda(); }
#else // HAVE_CUDA
//////////////////////////////////////////////////////////////////////////////
// pyrDown
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
}}}
void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
{
using namespace cv::gpu::cudev::imgproc;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{pyrDown_gpu<uchar> , 0 /*pyrDown_gpu<uchar2>*/ , pyrDown_gpu<uchar3> , pyrDown_gpu<uchar4> },
{0 /*pyrDown_gpu<schar>*/, 0 /*pyrDown_gpu<schar2>*/ , 0 /*pyrDown_gpu<schar3>*/, 0 /*pyrDown_gpu<schar4>*/},
{pyrDown_gpu<ushort> , 0 /*pyrDown_gpu<ushort2>*/, pyrDown_gpu<ushort3> , pyrDown_gpu<ushort4> },
{pyrDown_gpu<short> , 0 /*pyrDown_gpu<short2>*/ , pyrDown_gpu<short3> , pyrDown_gpu<short4> },
{0 /*pyrDown_gpu<int>*/ , 0 /*pyrDown_gpu<int2>*/ , 0 /*pyrDown_gpu<int3>*/ , 0 /*pyrDown_gpu<int4>*/ },
{pyrDown_gpu<float> , 0 /*pyrDown_gpu<float2>*/ , pyrDown_gpu<float3> , pyrDown_gpu<float4> }
};
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
func(src, dst, StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
// pyrUp
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
}}}
void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
{
using namespace cv::gpu::cudev::imgproc;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{pyrUp_gpu<uchar> , 0 /*pyrUp_gpu<uchar2>*/ , pyrUp_gpu<uchar3> , pyrUp_gpu<uchar4> },
{0 /*pyrUp_gpu<schar>*/, 0 /*pyrUp_gpu<schar2>*/ , 0 /*pyrUp_gpu<schar3>*/, 0 /*pyrUp_gpu<schar4>*/},
{pyrUp_gpu<ushort> , 0 /*pyrUp_gpu<ushort2>*/, pyrUp_gpu<ushort3> , pyrUp_gpu<ushort4> },
{pyrUp_gpu<short> , 0 /*pyrUp_gpu<short2>*/ , pyrUp_gpu<short3> , pyrUp_gpu<short4> },
{0 /*pyrUp_gpu<int>*/ , 0 /*pyrUp_gpu<int2>*/ , 0 /*pyrUp_gpu<int3>*/ , 0 /*pyrUp_gpu<int4>*/ },
{pyrUp_gpu<float> , 0 /*pyrUp_gpu<float2>*/ , pyrUp_gpu<float3> , pyrUp_gpu<float4> }
};
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
dst.create(src.rows * 2, src.cols * 2, src.type());
func(src, dst, StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
// ImagePyramid
namespace cv { namespace gpu { namespace cudev
{
namespace pyramid
{
template <typename T> void kernelDownsampleX2_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template <typename T> void kernelInterpolateFrom1_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
}
}}}
void cv::gpu::ImagePyramid::build(const GpuMat& img, int numLayers, Stream& stream)
{
using namespace cv::gpu::cudev::pyramid;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{kernelDownsampleX2_gpu<uchar1> , 0 /*kernelDownsampleX2_gpu<uchar2>*/ , kernelDownsampleX2_gpu<uchar3> , kernelDownsampleX2_gpu<uchar4> },
{0 /*kernelDownsampleX2_gpu<char1>*/ , 0 /*kernelDownsampleX2_gpu<char2>*/ , 0 /*kernelDownsampleX2_gpu<char3>*/ , 0 /*kernelDownsampleX2_gpu<char4>*/ },
{kernelDownsampleX2_gpu<ushort1> , 0 /*kernelDownsampleX2_gpu<ushort2>*/, kernelDownsampleX2_gpu<ushort3> , kernelDownsampleX2_gpu<ushort4> },
{0 /*kernelDownsampleX2_gpu<short1>*/ , 0 /*kernelDownsampleX2_gpu<short2>*/ , 0 /*kernelDownsampleX2_gpu<short3>*/, 0 /*kernelDownsampleX2_gpu<short4>*/},
{0 /*kernelDownsampleX2_gpu<int1>*/ , 0 /*kernelDownsampleX2_gpu<int2>*/ , 0 /*kernelDownsampleX2_gpu<int3>*/ , 0 /*kernelDownsampleX2_gpu<int4>*/ },
{kernelDownsampleX2_gpu<float1> , 0 /*kernelDownsampleX2_gpu<float2>*/ , kernelDownsampleX2_gpu<float3> , kernelDownsampleX2_gpu<float4> }
};
CV_Assert(img.depth() <= CV_32F && img.channels() <= 4);
const func_t func = funcs[img.depth()][img.channels() - 1];
CV_Assert(func != 0);
layer0_ = img;
Size szLastLayer = img.size();
nLayers_ = 1;
if (numLayers <= 0)
numLayers = 255; // the loop below cuts off once either dimension reaches 1
pyramid_.resize(numLayers);
for (int i = 0; i < numLayers - 1; ++i)
{
Size szCurLayer(szLastLayer.width / 2, szLastLayer.height / 2);
if (szCurLayer.width == 0 || szCurLayer.height == 0)
break;
ensureSizeIsEnough(szCurLayer, img.type(), pyramid_[i]);
nLayers_++;
const GpuMat& prevLayer = i == 0 ? layer0_ : pyramid_[i - 1];
func(prevLayer, pyramid_[i], StreamAccessor::getStream(stream));
szLastLayer = szCurLayer;
}
}
void cv::gpu::ImagePyramid::getLayer(GpuMat& outImg, Size outRoi, Stream& stream) const
{
using namespace cv::gpu::cudev::pyramid;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{kernelInterpolateFrom1_gpu<uchar1> , 0 /*kernelInterpolateFrom1_gpu<uchar2>*/ , kernelInterpolateFrom1_gpu<uchar3> , kernelInterpolateFrom1_gpu<uchar4> },
{0 /*kernelInterpolateFrom1_gpu<char1>*/ , 0 /*kernelInterpolateFrom1_gpu<char2>*/ , 0 /*kernelInterpolateFrom1_gpu<char3>*/ , 0 /*kernelInterpolateFrom1_gpu<char4>*/ },
{kernelInterpolateFrom1_gpu<ushort1> , 0 /*kernelInterpolateFrom1_gpu<ushort2>*/, kernelInterpolateFrom1_gpu<ushort3> , kernelInterpolateFrom1_gpu<ushort4> },
{0 /*kernelInterpolateFrom1_gpu<short1>*/, 0 /*kernelInterpolateFrom1_gpu<short2>*/ , 0 /*kernelInterpolateFrom1_gpu<short3>*/, 0 /*kernelInterpolateFrom1_gpu<short4>*/},
{0 /*kernelInterpolateFrom1_gpu<int1>*/ , 0 /*kernelInterpolateFrom1_gpu<int2>*/ , 0 /*kernelInterpolateFrom1_gpu<int3>*/ , 0 /*kernelInterpolateFrom1_gpu<int4>*/ },
{kernelInterpolateFrom1_gpu<float1> , 0 /*kernelInterpolateFrom1_gpu<float2>*/ , kernelInterpolateFrom1_gpu<float3> , kernelInterpolateFrom1_gpu<float4> }
};
CV_Assert(outRoi.width <= layer0_.cols && outRoi.height <= layer0_.rows && outRoi.width > 0 && outRoi.height > 0);
ensureSizeIsEnough(outRoi, layer0_.type(), outImg);
const func_t func = funcs[outImg.depth()][outImg.channels() - 1];
CV_Assert(func != 0);
if (outRoi.width == layer0_.cols && outRoi.height == layer0_.rows)
{
if (stream)
stream.enqueueCopy(layer0_, outImg);
else
layer0_.copyTo(outImg);
return;
}
float lastScale = 1.0f;
float curScale;
GpuMat lastLayer = layer0_;
GpuMat curLayer;
for (int i = 0; i < nLayers_ - 1; ++i)
{
curScale = lastScale * 0.5f;
curLayer = pyramid_[i];
if (outRoi.width == curLayer.cols && outRoi.height == curLayer.rows)
{
if (stream)
stream.enqueueCopy(curLayer, outImg);
else
curLayer.copyTo(outImg);
return;
}
if (outRoi.width >= curLayer.cols && outRoi.height >= curLayer.rows)
break;
lastScale = curScale;
lastLayer = curLayer;
}
func(lastLayer, outImg, StreamAccessor::getStream(stream));
}
#endif // HAVE_CUDA
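
A minimal sketch of the pyrDown/pyrUp and ImagePyramid interfaces implemented above; the header path, input.png, and a default-constructible ImagePyramid are assumptions taken from the 2.4-era cv::gpu API:

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp> // assumed pre-split header location

int main()
{
    cv::gpu::GpuMat d_src(cv::imread("input.png")); // hypothetical CV_8UC3 input
    cv::gpu::GpuMat d_half, d_restored;

    cv::gpu::pyrDown(d_src, d_half);     // (rows + 1) / 2 x (cols + 1) / 2
    cv::gpu::pyrUp(d_half, d_restored);  // 2 * rows x 2 * cols

    // Multi-layer pyramid with read-back of an arbitrary layer size.
    cv::gpu::ImagePyramid pyr;           // assumed default-constructible, as in the 2.4 gpu headers
    pyr.build(d_src, 4);
    cv::gpu::GpuMat d_layer;
    pyr.getLayer(d_layer, cv::Size(d_src.cols / 3, d_src.rows / 3));
    return 0;
}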

View File

@@ -1,102 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::gpu::remap(const GpuMat&, GpuMat&, const GpuMat&, const GpuMat&, int, int, Scalar, Stream&){ throw_no_cuda(); }
#else // HAVE_CUDA
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename T>
void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst,
int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
}
}}}
void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, int interpolation, int borderMode, Scalar borderValue, Stream& stream)
{
using namespace cv::gpu::cudev::imgproc;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
static const func_t funcs[6][4] =
{
{remap_gpu<uchar> , 0 /*remap_gpu<uchar2>*/ , remap_gpu<uchar3> , remap_gpu<uchar4> },
{0 /*remap_gpu<schar>*/, 0 /*remap_gpu<char2>*/ , 0 /*remap_gpu<char3>*/, 0 /*remap_gpu<char4>*/},
{remap_gpu<ushort> , 0 /*remap_gpu<ushort2>*/, remap_gpu<ushort3> , remap_gpu<ushort4> },
{remap_gpu<short> , 0 /*remap_gpu<short2>*/ , remap_gpu<short3> , remap_gpu<short4> },
{0 /*remap_gpu<int>*/ , 0 /*remap_gpu<int2>*/ , 0 /*remap_gpu<int3>*/ , 0 /*remap_gpu<int4>*/ },
{remap_gpu<float> , 0 /*remap_gpu<float2>*/ , remap_gpu<float3> , remap_gpu<float4> }
};
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size());
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
dst.create(xmap.size(), src.type());
Scalar_<float> borderValueFloat;
borderValueFloat = borderValue;
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, xmap, ymap,
dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(stream), deviceSupports(FEATURE_SET_COMPUTE_20));
}
#endif // HAVE_CUDA
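
A minimal sketch of calling remap above with CV_32FC1 maps that mirror the image horizontally; the header path and input.png are assumed for illustration:

#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp> // assumed pre-split header location

int main()
{
    cv::Mat src = cv::imread("input.png"); // hypothetical input
    cv::gpu::GpuMat d_src(src), d_dst;

    // Per-pixel source coordinates: dst(y, x) = src(ymap(y, x), xmap(y, x)).
    cv::Mat xmap(src.size(), CV_32FC1), ymap(src.size(), CV_32FC1);
    for (int y = 0; y < src.rows; ++y)
    {
        for (int x = 0; x < src.cols; ++x)
        {
            xmap.at<float>(y, x) = static_cast<float>(src.cols - 1 - x);
            ymap.at<float>(y, x) = static_cast<float>(y);
        }
    }

    cv::gpu::GpuMat d_xmap(xmap), d_ymap(ymap);
    cv::gpu::remap(d_src, d_dst, d_xmap, d_ymap, cv::INTER_LINEAR, cv::BORDER_CONSTANT);
    return 0;
}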

View File

@@ -1,162 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
(void)src;
(void)dst;
(void)dsize;
(void)fx;
(void)fy;
(void)interpolation;
(void)s;
throw_no_cuda();
}
#else // HAVE_CUDA
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
template <typename T>
void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
PtrStepSzb dst, int interpolation, cudaStream_t stream);
}
}}}
void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR
|| interpolation == INTER_CUBIC || interpolation == INTER_AREA);
CV_Assert(!(dsize == Size()) || (fx > 0 && fy > 0));
if (dsize == Size())
dsize = Size(saturate_cast<int>(src.cols * fx), saturate_cast<int>(src.rows * fy));
else
{
fx = static_cast<double>(dsize.width) / src.cols;
fy = static_cast<double>(dsize.height) / src.rows;
}
if (dsize != dst.size())
dst.create(dsize, src.type());
if (dsize == src.size())
{
if (s)
s.enqueueCopy(src, dst);
else
src.copyTo(dst);
return;
}
cudaStream_t stream = StreamAccessor::getStream(s);
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
bool useNpp = (src.type() == CV_8UC1 || src.type() == CV_8UC4);
useNpp = useNpp && (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR);
if (useNpp)
{
typedef NppStatus (*func_t)(const Npp8u * pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI, Npp8u * pDst, int nDstStep, NppiSize dstROISize,
double xFactor, double yFactor, int eInterpolation);
const func_t funcs[4] = { nppiResize_8u_C1R, 0, 0, nppiResize_8u_C4R };
static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC, 0, NPPI_INTER_LANCZOS};
NppiSize srcsz;
srcsz.width = wholeSize.width;
srcsz.height = wholeSize.height;
NppiRect srcrect;
srcrect.x = ofs.x;
srcrect.y = ofs.y;
srcrect.width = src.cols;
srcrect.height = src.rows;
NppiSize dstsz;
dstsz.width = dst.cols;
dstsz.height = dst.rows;
NppStreamHandler h(stream);
nppSafeCall( funcs[src.channels() - 1](src.datastart, srcsz, static_cast<int>(src.step), srcrect,
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstsz, fx, fy, npp_inter[interpolation]) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
else
{
using namespace ::cv::gpu::cudev::imgproc;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
static const func_t funcs[6][4] =
{
{resize_gpu<uchar> , 0 /*resize_gpu<uchar2>*/ , resize_gpu<uchar3> , resize_gpu<uchar4> },
{0 /*resize_gpu<schar>*/, 0 /*resize_gpu<char2>*/ , 0 /*resize_gpu<char3>*/, 0 /*resize_gpu<char4>*/},
{resize_gpu<ushort> , 0 /*resize_gpu<ushort2>*/, resize_gpu<ushort3> , resize_gpu<ushort4> },
{resize_gpu<short> , 0 /*resize_gpu<short2>*/ , resize_gpu<short3> , resize_gpu<short4> },
{0 /*resize_gpu<int>*/ , 0 /*resize_gpu<int2>*/ , 0 /*resize_gpu<int3>*/ , 0 /*resize_gpu<int4>*/ },
{resize_gpu<float> , 0 /*resize_gpu<float2>*/ , resize_gpu<float3> , resize_gpu<float4> }
};
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y,
static_cast<float>(1.0 / fx), static_cast<float>(1.0 / fy), dst, interpolation, stream);
}
}
#endif // HAVE_CUDA
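
A minimal sketch of calling resize above; the header path and input.png are assumed for illustration. An 8UC1 or 8UC4 source with nearest or linear interpolation takes the NPP branch, everything else takes the CUDA kernel branch:

#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp> // assumed pre-split header location

int main()
{
    cv::gpu::GpuMat d_src(cv::imread("input.png")); // hypothetical CV_8UC3 input
    cv::gpu::GpuMat d_dst;

    // Halve both dimensions; dsize is derived from fx/fy when left empty.
    cv::gpu::resize(d_src, d_dst, cv::Size(), 0.5, 0.5, cv::INTER_LINEAR);
    return 0;
}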

View File

@@ -1,454 +0,0 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::gpu::warpAffine(const GpuMat&, GpuMat&, const Mat&, Size, int, int, Scalar, Stream&) { throw_no_cuda(); }
void cv::gpu::buildWarpAffineMaps(const Mat&, bool, Size, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
void cv::gpu::warpPerspective(const GpuMat&, GpuMat&, const Mat&, Size, int, int, Scalar, Stream&) { throw_no_cuda(); }
void cv::gpu::buildWarpPerspectiveMaps(const Mat&, bool, Size, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
#else // HAVE_CUDA
namespace cv { namespace gpu { namespace cudev
{
namespace imgproc
{
void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream);
template <typename T>
void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream);
template <typename T>
void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
}
}}}
void cv::gpu::buildWarpAffineMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream)
{
using namespace cv::gpu::cudev::imgproc;
CV_Assert(M.rows == 2 && M.cols == 3);
xmap.create(dsize, CV_32FC1);
ymap.create(dsize, CV_32FC1);
float coeffs[2 * 3];
Mat coeffsMat(2, 3, CV_32F, (void*)coeffs);
if (inverse)
M.convertTo(coeffsMat, coeffsMat.type());
else
{
cv::Mat iM;
invertAffineTransform(M, iM);
iM.convertTo(coeffsMat, coeffsMat.type());
}
buildWarpAffineMaps_gpu(coeffs, xmap, ymap, StreamAccessor::getStream(stream));
}
void cv::gpu::buildWarpPerspectiveMaps(const Mat& M, bool inverse, Size dsize, GpuMat& xmap, GpuMat& ymap, Stream& stream)
{
using namespace cv::gpu::cudev::imgproc;
CV_Assert(M.rows == 3 && M.cols == 3);
xmap.create(dsize, CV_32FC1);
ymap.create(dsize, CV_32FC1);
float coeffs[3 * 3];
Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);
if (inverse)
M.convertTo(coeffsMat, coeffsMat.type());
else
{
cv::Mat iM;
invert(M, iM);
iM.convertTo(coeffsMat, coeffsMat.type());
}
buildWarpPerspectiveMaps_gpu(coeffs, xmap, ymap, StreamAccessor::getStream(stream));
}
namespace
{
template<int DEPTH> struct NppTypeTraits;
template<> struct NppTypeTraits<CV_8U> { typedef Npp8u npp_t; };
template<> struct NppTypeTraits<CV_8S> { typedef Npp8s npp_t; };
template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; typedef Npp16sc npp_complex_type; };
template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; typedef Npp32sc npp_complex_type; };
template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; typedef Npp32fc npp_complex_type; };
template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; typedef Npp64fc npp_complex_type; };
template <int DEPTH> struct NppWarpFunc
{
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
typedef NppStatus (*func_t)(const npp_t* pSrc, NppiSize srcSize, int srcStep, NppiRect srcRoi, npp_t* pDst,
int dstStep, NppiRect dstRoi, const double coeffs[][3],
int interpolation);
};
template <int DEPTH, typename NppWarpFunc<DEPTH>::func_t func> struct NppWarp
{
typedef typename NppWarpFunc<DEPTH>::npp_t npp_t;
static void call(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int interpolation, cudaStream_t stream)
{
static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
NppiSize srcsz;
srcsz.height = src.rows;
srcsz.width = src.cols;
NppiRect srcroi;
srcroi.x = 0;
srcroi.y = 0;
srcroi.height = src.rows;
srcroi.width = src.cols;
NppiRect dstroi;
dstroi.x = 0;
dstroi.y = 0;
dstroi.height = dst.rows;
dstroi.width = dst.cols;
cv::gpu::NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<npp_t>(), srcsz, static_cast<int>(src.step), srcroi,
dst.ptr<npp_t>(), static_cast<int>(dst.step), dstroi,
coeffs, npp_inter[interpolation]) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
}
void cv::gpu::warpAffine(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& s)
{
CV_Assert(M.rows == 2 && M.cols == 3);
int interpolation = flags & INTER_MAX;
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
dst.create(dsize, src.type());
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
static const bool useNppTab[6][4][3] =
{
{
{false, false, true},
{false, false, false},
{false, true, true},
{false, false, false}
},
{
{false, false, false},
{false, false, false},
{false, false, false},
{false, false, false}
},
{
{false, true, true},
{false, false, false},
{false, true, true},
{false, false, false}
},
{
{false, false, false},
{false, false, false},
{false, false, false},
{false, false, false}
},
{
{false, true, true},
{false, false, false},
{false, true, true},
{false, false, true}
},
{
{false, true, true},
{false, false, false},
{false, true, true},
{false, false, true}
}
};
bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
// NPP bug on float data
useNpp = useNpp && src.depth() != CV_32F;
if (useNpp)
{
typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);
static const func_t funcs[2][6][4] =
{
{
{NppWarp<CV_8U, nppiWarpAffine_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpAffine_8u_C3R>::call, NppWarp<CV_8U, nppiWarpAffine_8u_C4R>::call},
{0, 0, 0, 0},
{NppWarp<CV_16U, nppiWarpAffine_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpAffine_16u_C3R>::call, NppWarp<CV_16U, nppiWarpAffine_16u_C4R>::call},
{0, 0, 0, 0},
{NppWarp<CV_32S, nppiWarpAffine_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpAffine_32s_C3R>::call, NppWarp<CV_32S, nppiWarpAffine_32s_C4R>::call},
{NppWarp<CV_32F, nppiWarpAffine_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpAffine_32f_C3R>::call, NppWarp<CV_32F, nppiWarpAffine_32f_C4R>::call}
},
{
{NppWarp<CV_8U, nppiWarpAffineBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpAffineBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpAffineBack_8u_C4R>::call},
{0, 0, 0, 0},
{NppWarp<CV_16U, nppiWarpAffineBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpAffineBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpAffineBack_16u_C4R>::call},
{0, 0, 0, 0},
{NppWarp<CV_32S, nppiWarpAffineBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpAffineBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpAffineBack_32s_C4R>::call},
{NppWarp<CV_32F, nppiWarpAffineBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpAffineBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpAffineBack_32f_C4R>::call}
}
};
dst.setTo(borderValue);
double coeffs[2][3];
Mat coeffsMat(2, 3, CV_64F, (void*)coeffs);
M.convertTo(coeffsMat, coeffsMat.type());
const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
CV_Assert(func != 0);
func(src, dst, coeffs, interpolation, StreamAccessor::getStream(s));
}
else
{
using namespace cv::gpu::cudev::imgproc;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
static const func_t funcs[6][4] =
{
{warpAffine_gpu<uchar> , 0 /*warpAffine_gpu<uchar2>*/ , warpAffine_gpu<uchar3> , warpAffine_gpu<uchar4> },
{0 /*warpAffine_gpu<schar>*/, 0 /*warpAffine_gpu<char2>*/ , 0 /*warpAffine_gpu<char3>*/, 0 /*warpAffine_gpu<char4>*/},
{warpAffine_gpu<ushort> , 0 /*warpAffine_gpu<ushort2>*/, warpAffine_gpu<ushort3> , warpAffine_gpu<ushort4> },
{warpAffine_gpu<short> , 0 /*warpAffine_gpu<short2>*/ , warpAffine_gpu<short3> , warpAffine_gpu<short4> },
{0 /*warpAffine_gpu<int>*/ , 0 /*warpAffine_gpu<int2>*/ , 0 /*warpAffine_gpu<int3>*/ , 0 /*warpAffine_gpu<int4>*/ },
{warpAffine_gpu<float> , 0 /*warpAffine_gpu<float2>*/ , warpAffine_gpu<float3> , warpAffine_gpu<float4> }
};
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
float coeffs[2 * 3];
Mat coeffsMat(2, 3, CV_32F, (void*)coeffs);
if (flags & WARP_INVERSE_MAP)
M.convertTo(coeffsMat, coeffsMat.type());
else
{
cv::Mat iM;
invertAffineTransform(M, iM);
iM.convertTo(coeffsMat, coeffsMat.type());
}
Scalar_<float> borderValueFloat;
borderValueFloat = borderValue;
func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), deviceSupports(FEATURE_SET_COMPUTE_20));
}
}
void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size dsize, int flags, int borderMode, Scalar borderValue, Stream& s)
{
CV_Assert(M.rows == 3 && M.cols == 3);
int interpolation = flags & INTER_MAX;
CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
dst.create(dsize, src.type());
Size wholeSize;
Point ofs;
src.locateROI(wholeSize, ofs);
static const bool useNppTab[6][4][3] =
{
{
{false, false, true},
{false, false, false},
{false, true, true},
{false, false, false}
},
{
{false, false, false},
{false, false, false},
{false, false, false},
{false, false, false}
},
{
{false, true, true},
{false, false, false},
{false, true, true},
{false, false, false}
},
{
{false, false, false},
{false, false, false},
{false, false, false},
{false, false, false}
},
{
{false, true, true},
{false, false, false},
{false, true, true},
{false, false, true}
},
{
{false, true, true},
{false, false, false},
{false, true, true},
{false, false, true}
}
};
bool useNpp = borderMode == BORDER_CONSTANT && ofs.x == 0 && ofs.y == 0 && useNppTab[src.depth()][src.channels() - 1][interpolation];
// NPP bug on float data
useNpp = useNpp && src.depth() != CV_32F;
if (useNpp)
{
typedef void (*func_t)(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, double coeffs[][3], int flags, cudaStream_t stream);
static const func_t funcs[2][6][4] =
{
{
{NppWarp<CV_8U, nppiWarpPerspective_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspective_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspective_8u_C4R>::call},
{0, 0, 0, 0},
{NppWarp<CV_16U, nppiWarpPerspective_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspective_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspective_16u_C4R>::call},
{0, 0, 0, 0},
{NppWarp<CV_32S, nppiWarpPerspective_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspective_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspective_32s_C4R>::call},
{NppWarp<CV_32F, nppiWarpPerspective_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspective_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspective_32f_C4R>::call}
},
{
{NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C1R>::call, 0, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C3R>::call, NppWarp<CV_8U, nppiWarpPerspectiveBack_8u_C4R>::call},
{0, 0, 0, 0},
{NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C1R>::call, 0, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C3R>::call, NppWarp<CV_16U, nppiWarpPerspectiveBack_16u_C4R>::call},
{0, 0, 0, 0},
{NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C1R>::call, 0, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C3R>::call, NppWarp<CV_32S, nppiWarpPerspectiveBack_32s_C4R>::call},
{NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C1R>::call, 0, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C3R>::call, NppWarp<CV_32F, nppiWarpPerspectiveBack_32f_C4R>::call}
}
};
dst.setTo(borderValue);
double coeffs[3][3];
Mat coeffsMat(3, 3, CV_64F, (void*)coeffs);
M.convertTo(coeffsMat, coeffsMat.type());
const func_t func = funcs[(flags & WARP_INVERSE_MAP) != 0][src.depth()][src.channels() - 1];
CV_Assert(func != 0);
func(src, dst, coeffs, interpolation, StreamAccessor::getStream(s));
}
else
{
using namespace cv::gpu::cudev::imgproc;
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
static const func_t funcs[6][4] =
{
{warpPerspective_gpu<uchar> , 0 /*warpPerspective_gpu<uchar2>*/ , warpPerspective_gpu<uchar3> , warpPerspective_gpu<uchar4> },
{0 /*warpPerspective_gpu<schar>*/, 0 /*warpPerspective_gpu<char2>*/ , 0 /*warpPerspective_gpu<char3>*/, 0 /*warpPerspective_gpu<char4>*/},
{warpPerspective_gpu<ushort> , 0 /*warpPerspective_gpu<ushort2>*/, warpPerspective_gpu<ushort3> , warpPerspective_gpu<ushort4> },
{warpPerspective_gpu<short> , 0 /*warpPerspective_gpu<short2>*/ , warpPerspective_gpu<short3> , warpPerspective_gpu<short4> },
{0 /*warpPerspective_gpu<int>*/ , 0 /*warpPerspective_gpu<int2>*/ , 0 /*warpPerspective_gpu<int3>*/ , 0 /*warpPerspective_gpu<int4>*/ },
{warpPerspective_gpu<float> , 0 /*warpPerspective_gpu<float2>*/ , warpPerspective_gpu<float3> , warpPerspective_gpu<float4> }
};
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert(func != 0);
int gpuBorderType;
CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
float coeffs[3 * 3];
Mat coeffsMat(3, 3, CV_32F, (void*)coeffs);
if (flags & WARP_INVERSE_MAP)
M.convertTo(coeffsMat, coeffsMat.type());
else
{
cv::Mat iM;
invert(M, iM);
iM.convertTo(coeffsMat, coeffsMat.type());
}
Scalar_<float> borderValueFloat;
borderValueFloat = borderValue;
func(src, PtrStepSzb(wholeSize.height, wholeSize.width, src.datastart, src.step), ofs.x, ofs.y, coeffs,
dst, interpolation, gpuBorderType, borderValueFloat.val, StreamAccessor::getStream(s), deviceSupports(FEATURE_SET_COMPUTE_20));
}
}
#endif // HAVE_CUDA
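
A minimal sketch of calling warpAffine and warpPerspective above; the header path and input.png are assumed for illustration, the affine matrix comes from getRotationMatrix2D, and the perspective matrix is just an identity placeholder:

#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/gpu/gpu.hpp> // assumed pre-split header location

int main()
{
    cv::Mat src = cv::imread("input.png"); // hypothetical input
    cv::gpu::GpuMat d_src(src), d_dst;

    // 2x3 affine matrix: rotate 30 degrees around the image center.
    cv::Mat M = cv::getRotationMatrix2D(
        cv::Point2f(src.cols / 2.0f, src.rows / 2.0f), 30.0, 1.0);
    cv::gpu::warpAffine(d_src, d_dst, M, d_src.size(),
                        cv::INTER_LINEAR, cv::BORDER_CONSTANT);

    // warpPerspective takes a 3x3 matrix instead; identity leaves the image unchanged.
    cv::Mat H = cv::Mat::eye(3, 3, CV_64F);
    cv::gpu::warpPerspective(d_src, d_dst, H, d_src.size(),
                             cv::INTER_LINEAR, cv::BORDER_CONSTANT);
    return 0;
}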