Revert "Revert "Merge pull request #836 from jet47:gpu-modules""

2013-06-04 13:32:35 +04:00
parent 10340fe234
commit 3eeaa9189c
472 changed files with 29894 additions and 23019 deletions
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -1,565 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-void cv::gpu::gemm(const GpuMat&, const GpuMat&, double, const GpuMat&, double, GpuMat&, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::magnitude(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::magnitudeSqr(const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::magnitude(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::magnitudeSqr(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
-void cv::gpu::phase(const GpuMat&, const GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
-void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
-void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, Stream&) { throw_no_cuda(); }
-void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&) { throw_no_cuda(); }
-void cv::gpu::normalize(const GpuMat&, GpuMat&, double, double, int, int, const GpuMat&, GpuMat&, GpuMat&) { throw_no_cuda(); }
-
-#else /* !defined (HAVE_CUDA) */
-
-////////////////////////////////////////////////////////////////////////
-// gemm
-
-void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
-{
-#ifndef HAVE_CUBLAS
-    (void)src1;
-    (void)src2;
-    (void)alpha;
-    (void)src3;
-    (void)beta;
-    (void)dst;
-    (void)flags;
-    (void)stream;
-    CV_Error(cv::Error::StsNotImplemented, "The library was build without CUBLAS");
-#else
-    // CUBLAS works with column-major matrices
-
-    CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
-    CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()));
-
-    if (src1.depth() == CV_64F)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-    }
-
-    bool tr1 = (flags & GEMM_1_T) != 0;
-    bool tr2 = (flags & GEMM_2_T) != 0;
-    bool tr3 = (flags & GEMM_3_T) != 0;
-
-    if (src1.type() == CV_64FC2)
-    {
-        if (tr1 || tr2 || tr3)
-            CV_Error(cv::Error::StsNotImplemented, "transpose operation doesn't implemented for CV_64FC2 type");
-    }
-
-    Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size();
-    Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size();
-    Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size();
-    Size dstSize(src2Size.width, src1Size.height);
-
-    CV_Assert(src1Size.width == src2Size.height);
-    CV_Assert(src3.empty() || src3Size == dstSize);
-
-    dst.create(dstSize, src1.type());
-
-    if (beta != 0)
-    {
-        if (src3.empty())
-        {
-            if (stream)
-                stream.enqueueMemSet(dst, Scalar::all(0));
-            else
-                dst.setTo(Scalar::all(0));
-        }
-        else
-        {
-            if (tr3)
-            {
-                transpose(src3, dst, stream);
-            }
-            else
-            {
-                if (stream)
-                    stream.enqueueCopy(src3, dst);
-                else
-                    src3.copyTo(dst);
-            }
-        }
-    }
-
-    cublasHandle_t handle;
-    cublasSafeCall( cublasCreate_v2(&handle) );
-
-    cublasSafeCall( cublasSetStream_v2(handle, StreamAccessor::getStream(stream)) );
-
-    cublasSafeCall( cublasSetPointerMode_v2(handle, CUBLAS_POINTER_MODE_HOST) );
-
-    const float alphaf = static_cast<float>(alpha);
-    const float betaf = static_cast<float>(beta);
-
-    const cuComplex alphacf = make_cuComplex(alphaf, 0);
-    const cuComplex betacf = make_cuComplex(betaf, 0);
-
-    const cuDoubleComplex alphac = make_cuDoubleComplex(alpha, 0);
-    const cuDoubleComplex betac = make_cuDoubleComplex(beta, 0);
-
-    cublasOperation_t transa = tr2 ? CUBLAS_OP_T : CUBLAS_OP_N;
-    cublasOperation_t transb = tr1 ? CUBLAS_OP_T : CUBLAS_OP_N;
-
-    switch (src1.type())
-    {
-    case CV_32FC1:
-        cublasSafeCall( cublasSgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
-            &alphaf,
-            src2.ptr<float>(), static_cast<int>(src2.step / sizeof(float)),
-            src1.ptr<float>(), static_cast<int>(src1.step / sizeof(float)),
-            &betaf,
-            dst.ptr<float>(), static_cast<int>(dst.step / sizeof(float))) );
-        break;
-
-    case CV_64FC1:
-        cublasSafeCall( cublasDgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
-            &alpha,
-            src2.ptr<double>(), static_cast<int>(src2.step / sizeof(double)),
-            src1.ptr<double>(), static_cast<int>(src1.step / sizeof(double)),
-            &beta,
-            dst.ptr<double>(), static_cast<int>(dst.step / sizeof(double))) );
-        break;
-
-    case CV_32FC2:
-        cublasSafeCall( cublasCgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
-            &alphacf,
-            src2.ptr<cuComplex>(), static_cast<int>(src2.step / sizeof(cuComplex)),
-            src1.ptr<cuComplex>(), static_cast<int>(src1.step / sizeof(cuComplex)),
-            &betacf,
-            dst.ptr<cuComplex>(), static_cast<int>(dst.step / sizeof(cuComplex))) );
-        break;
-
-    case CV_64FC2:
-        cublasSafeCall( cublasZgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
-            &alphac,
-            src2.ptr<cuDoubleComplex>(), static_cast<int>(src2.step / sizeof(cuDoubleComplex)),
-            src1.ptr<cuDoubleComplex>(), static_cast<int>(src1.step / sizeof(cuDoubleComplex)),
-            &betac,
-            dst.ptr<cuDoubleComplex>(), static_cast<int>(dst.step / sizeof(cuDoubleComplex))) );
-        break;
-    }
-
-    cublasSafeCall( cublasDestroy_v2(handle) );
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////
-// transpose
-
-void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
-{
-    CV_Assert(src.elemSize() == 1 || src.elemSize() == 4 || src.elemSize() == 8);
-
-    dst.create( src.cols, src.rows, src.type() );
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    if (src.elemSize() == 1)
-    {
-        NppStreamHandler h(stream);
-
-        NppiSize sz;
-        sz.width  = src.cols;
-        sz.height = src.rows;
-
-        nppSafeCall( nppiTranspose_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz) );
-    }
-    else if (src.elemSize() == 4)
-    {
-        NppStStreamHandler h(stream);
-
-        NcvSize32u sz;
-        sz.width  = src.cols;
-        sz.height = src.rows;
-
-        ncvSafeCall( nppiStTranspose_32u_C1R(const_cast<Ncv32u*>(src.ptr<Ncv32u>()), static_cast<int>(src.step),
-            dst.ptr<Ncv32u>(), static_cast<int>(dst.step), sz) );
-    }
-    else // if (src.elemSize() == 8)
-    {
-        if (!deviceSupports(NATIVE_DOUBLE))
-            CV_Error(cv::Error::StsUnsupportedFormat, "The device doesn't support double");
-
-        NppStStreamHandler h(stream);
-
-        NcvSize32u sz;
-        sz.width  = src.cols;
-        sz.height = src.rows;
-
-        ncvSafeCall( nppiStTranspose_64u_C1R(const_cast<Ncv64u*>(src.ptr<Ncv64u>()), static_cast<int>(src.step),
-            dst.ptr<Ncv64u>(), static_cast<int>(dst.step), sz) );
-    }
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-////////////////////////////////////////////////////////////////////////
-// flip
-
-namespace
-{
-    template<int DEPTH> struct NppTypeTraits;
-    template<> struct NppTypeTraits<CV_8U>  { typedef Npp8u npp_t; };
-    template<> struct NppTypeTraits<CV_8S>  { typedef Npp8s npp_t; };
-    template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
-    template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; };
-    template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; };
-    template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; };
-    template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; };
-
-    template <int DEPTH> struct NppMirrorFunc
-    {
-        typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
-
-        typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip);
-    };
-
-    template <int DEPTH, typename NppMirrorFunc<DEPTH>::func_t func> struct NppMirror
-    {
-        typedef typename NppMirrorFunc<DEPTH>::npp_t npp_t;
-
-        static void call(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream)
-        {
-            NppStreamHandler h(stream);
-
-            NppiSize sz;
-            sz.width  = src.cols;
-            sz.height = src.rows;
-
-            nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step),
-                dst.ptr<npp_t>(), static_cast<int>(dst.step), sz,
-                (flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-}
-
-void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
-{
-    typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
-    static const func_t funcs[6][4] =
-    {
-        {NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call},
-        {0,0,0,0},
-        {NppMirror<CV_16U, nppiMirror_16u_C1R>::call, 0, NppMirror<CV_16U, nppiMirror_16u_C3R>::call, NppMirror<CV_16U, nppiMirror_16u_C4R>::call},
-        {0,0,0,0},
-        {NppMirror<CV_32S, nppiMirror_32s_C1R>::call, 0, NppMirror<CV_32S, nppiMirror_32s_C3R>::call, NppMirror<CV_32S, nppiMirror_32s_C4R>::call},
-        {NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
-    };
-
-    CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F);
-    CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
-
-    dst.create(src.size(), src.type());
-
-    funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
-}
-
-////////////////////////////////////////////////////////////////////////
-// LUT
-
-void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
-{
-    const int cn = src.channels();
-
-    CV_Assert( src.type() == CV_8UC1 || src.type() == CV_8UC3 );
-    CV_Assert( lut.depth() == CV_8U );
-    CV_Assert( lut.channels() == 1 || lut.channels() == cn );
-    CV_Assert( lut.rows * lut.cols == 256 && lut.isContinuous() );
-
-    dst.create(src.size(), CV_MAKE_TYPE(lut.depth(), cn));
-
-    NppiSize sz;
-    sz.height = src.rows;
-    sz.width = src.cols;
-
-    Mat nppLut;
-    lut.convertTo(nppLut, CV_32S);
-
-    int nValues3[] = {256, 256, 256};
-
-    Npp32s pLevels[256];
-    for (int i = 0; i < 256; ++i)
-        pLevels[i] = i;
-
-    const Npp32s* pLevels3[3];
-
-#if (CUDA_VERSION <= 4020)
-    pLevels3[0] = pLevels3[1] = pLevels3[2] = pLevels;
-#else
-    GpuMat d_pLevels;
-    d_pLevels.upload(Mat(1, 256, CV_32S, pLevels));
-    pLevels3[0] = pLevels3[1] = pLevels3[2] = d_pLevels.ptr<Npp32s>();
-#endif
-
-    cudaStream_t stream = StreamAccessor::getStream(s);
-    NppStreamHandler h(stream);
-
-    if (src.type() == CV_8UC1)
-    {
-#if (CUDA_VERSION <= 4020)
-        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, nppLut.ptr<Npp32s>(), pLevels, 256) );
-#else
-        GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
-        nppSafeCall( nppiLUT_Linear_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, d_nppLut.ptr<Npp32s>(), d_pLevels.ptr<Npp32s>(), 256) );
-#endif
-    }
-    else
-    {
-        const Npp32s* pValues3[3];
-
-        Mat nppLut3[3];
-        if (nppLut.channels() == 1)
-        {
-#if (CUDA_VERSION <= 4020)
-            pValues3[0] = pValues3[1] = pValues3[2] = nppLut.ptr<Npp32s>();
-#else
-            GpuMat d_nppLut(Mat(1, 256, CV_32S, nppLut.data));
-            pValues3[0] = pValues3[1] = pValues3[2] = d_nppLut.ptr<Npp32s>();
-#endif
-        }
-        else
-        {
-            cv::split(nppLut, nppLut3);
-
-#if (CUDA_VERSION <= 4020)
-            pValues3[0] = nppLut3[0].ptr<Npp32s>();
-            pValues3[1] = nppLut3[1].ptr<Npp32s>();
-            pValues3[2] = nppLut3[2].ptr<Npp32s>();
-#else
-            GpuMat d_nppLut0(Mat(1, 256, CV_32S, nppLut3[0].data));
-            GpuMat d_nppLut1(Mat(1, 256, CV_32S, nppLut3[1].data));
-            GpuMat d_nppLut2(Mat(1, 256, CV_32S, nppLut3[2].data));
-
-            pValues3[0] = d_nppLut0.ptr<Npp32s>();
-            pValues3[1] = d_nppLut1.ptr<Npp32s>();
-            pValues3[2] = d_nppLut2.ptr<Npp32s>();
-#endif
-        }
-
-        nppSafeCall( nppiLUT_Linear_8u_C3R(src.ptr<Npp8u>(), static_cast<int>(src.step),
-            dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, pValues3, pLevels3, nValues3) );
-    }
-
-    if (stream == 0)
-        cudaSafeCall( cudaDeviceSynchronize() );
-}
-
-////////////////////////////////////////////////////////////////////////
-// NPP magnitide
-
-namespace
-{
-    typedef NppStatus (*nppMagnitude_t)(const Npp32fc* pSrc, int nSrcStep, Npp32f* pDst, int nDstStep, NppiSize oSizeROI);
-
-    inline void npp_magnitude(const GpuMat& src, GpuMat& dst, nppMagnitude_t func, cudaStream_t stream)
-    {
-        CV_Assert(src.type() == CV_32FC2);
-
-        dst.create(src.size(), CV_32FC1);
-
-        NppiSize sz;
-        sz.width = src.cols;
-        sz.height = src.rows;
-
-        NppStreamHandler h(stream);
-
-        nppSafeCall( func(src.ptr<Npp32fc>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-void cv::gpu::magnitude(const GpuMat& src, GpuMat& dst, Stream& stream)
-{
-    npp_magnitude(src, dst, nppiMagnitude_32fc32f_C1R, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
-{
-    npp_magnitude(src, dst, nppiMagnitudeSqr_32fc32f_C1R, StreamAccessor::getStream(stream));
-}
-
-////////////////////////////////////////////////////////////////////////
-// Polar <-> Cart
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace mathfunc
-    {
-        void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
-        void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
-    }
-}}}
-
-namespace
-{
-    inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)
-    {
-        using namespace ::cv::gpu::cudev::mathfunc;
-
-        CV_Assert(x.size() == y.size() && x.type() == y.type());
-        CV_Assert(x.depth() == CV_32F);
-
-        if (mag)
-            mag->create(x.size(), x.type());
-        if (angle)
-            angle->create(x.size(), x.type());
-
-        GpuMat x1cn = x.reshape(1);
-        GpuMat y1cn = y.reshape(1);
-        GpuMat mag1cn = mag ? mag->reshape(1) : GpuMat();
-        GpuMat angle1cn = angle ? angle->reshape(1) : GpuMat();
-
-        cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream);
-    }
-
-    inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)
-    {
-        using namespace ::cv::gpu::cudev::mathfunc;
-
-        CV_Assert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
-        CV_Assert(mag.depth() == CV_32F);
-
-        x.create(mag.size(), mag.type());
-        y.create(mag.size(), mag.type());
-
-        GpuMat mag1cn = mag.reshape(1);
-        GpuMat angle1cn = angle.reshape(1);
-        GpuMat x1cn = x.reshape(1);
-        GpuMat y1cn = y.reshape(1);
-
-        polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream);
-    }
-}
-
-void cv::gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
-{
-    cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
-{
-    cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, Stream& stream)
-{
-    cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& mag, GpuMat& angle, bool angleInDegrees, Stream& stream)
-{
-    cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
-}
-
-void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, Stream& stream)
-{
-    polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream));
-}
-
-////////////////////////////////////////////////////////////////////////
-// normalize
-
-void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask)
-{
-    GpuMat norm_buf;
-    GpuMat cvt_buf;
-    normalize(src, dst, a, b, norm_type, dtype, mask, norm_buf, cvt_buf);
-}
-
-void cv::gpu::normalize(const GpuMat& src, GpuMat& dst, double a, double b, int norm_type, int dtype, const GpuMat& mask, GpuMat& norm_buf, GpuMat& cvt_buf)
-{
-    double scale = 1, shift = 0;
-    if (norm_type == NORM_MINMAX)
-    {
-        double smin = 0, smax = 0;
-        double dmin = std::min(a, b), dmax = std::max(a, b);
-        minMax(src, &smin, &smax, mask, norm_buf);
-        scale = (dmax - dmin) * (smax - smin > std::numeric_limits<double>::epsilon() ? 1.0 / (smax - smin) : 0.0);
-        shift = dmin - smin * scale;
-    }
-    else if (norm_type == NORM_L2 || norm_type == NORM_L1 || norm_type == NORM_INF)
-    {
-        scale = norm(src, norm_type, mask, norm_buf);
-        scale = scale > std::numeric_limits<double>::epsilon() ? a / scale : 0.0;
-        shift = 0;
-    }
-    else
-    {
-        CV_Error(cv::Error::StsBadArg, "Unknown/unsupported norm type");
-    }
-
-    if (mask.empty())
-    {
-        src.convertTo(dst, dtype, scale, shift);
-    }
-    else
-    {
-        src.convertTo(cvt_buf, dtype, scale, shift);
-        cvt_buf.copyTo(dst, mask);
-    }
-}
-
-#endif /* !defined (HAVE_CUDA) */
--- a/modules/gpu/src/bgfg_gmg.cpp
+++ b/modules/gpu/src/bgfg_gmg.cpp
@@ -1,168 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
-
-cv::gpu::GMG_GPU::GMG_GPU() { throw_no_cuda(); }
-void cv::gpu::GMG_GPU::initialize(cv::Size, float, float) { throw_no_cuda(); }
-void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, float, cv::gpu::Stream&) { throw_no_cuda(); }
-void cv::gpu::GMG_GPU::release() {}
-
-#else
-
-namespace cv { namespace gpu { namespace cudev {
-    namespace bgfg_gmg
-    {
-        void loadConstants(int width, int height, float minVal, float maxVal, int quantizationLevels, float backgroundPrior,
-                           float decisionThreshold, int maxFeatures, int numInitializationFrames);
-
-        template <typename SrcT>
-        void update_gpu(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
-                        int frameNum,  float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-    }
-}}}
-
-cv::gpu::GMG_GPU::GMG_GPU()
-{
-    maxFeatures = 64;
-    learningRate = 0.025f;
-    numInitializationFrames = 120;
-    quantizationLevels = 16;
-    backgroundPrior = 0.8f;
-    decisionThreshold = 0.8f;
-    smoothingRadius = 7;
-    updateBackgroundModel = true;
-}
-
-void cv::gpu::GMG_GPU::initialize(cv::Size frameSize, float min, float max)
-{
-    using namespace cv::gpu::cudev::bgfg_gmg;
-
-    CV_Assert(min < max);
-    CV_Assert(maxFeatures > 0);
-    CV_Assert(learningRate >= 0.0f && learningRate <= 1.0f);
-    CV_Assert(numInitializationFrames >= 1);
-    CV_Assert(quantizationLevels >= 1 && quantizationLevels <= 255);
-    CV_Assert(backgroundPrior >= 0.0f && backgroundPrior <= 1.0f);
-
-    minVal_ = min;
-    maxVal_ = max;
-
-    frameSize_ = frameSize;
-
-    frameNum_ = 0;
-
-    nfeatures_.create(frameSize_, CV_32SC1);
-    colors_.create(maxFeatures * frameSize_.height, frameSize_.width, CV_32SC1);
-    weights_.create(maxFeatures * frameSize_.height, frameSize_.width, CV_32FC1);
-
-    nfeatures_.setTo(cv::Scalar::all(0));
-
-    if (smoothingRadius > 0)
-        boxFilter_ = cv::gpu::createBoxFilter_GPU(CV_8UC1, CV_8UC1, cv::Size(smoothingRadius, smoothingRadius));
-
-    loadConstants(frameSize_.width, frameSize_.height, minVal_, maxVal_, quantizationLevels, backgroundPrior, decisionThreshold, maxFeatures, numInitializationFrames);
-}
-
-void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat& fgmask, float newLearningRate, cv::gpu::Stream& stream)
-{
-    using namespace cv::gpu::cudev::bgfg_gmg;
-
-    typedef void (*func_t)(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
-                           int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-    static const func_t funcs[6][4] =
-    {
-        {update_gpu<uchar>, 0, update_gpu<uchar3>, update_gpu<uchar4>},
-        {0,0,0,0},
-        {update_gpu<ushort>, 0, update_gpu<ushort3>, update_gpu<ushort4>},
-        {0,0,0,0},
-        {0,0,0,0},
-        {update_gpu<float>, 0, update_gpu<float3>, update_gpu<float4>}
-    };
-
-    CV_Assert(frame.depth() == CV_8U || frame.depth() == CV_16U || frame.depth() == CV_32F);
-    CV_Assert(frame.channels() == 1 || frame.channels() == 3 || frame.channels() == 4);
-
-    if (newLearningRate != -1.0f)
-    {
-        CV_Assert(newLearningRate >= 0.0f && newLearningRate <= 1.0f);
-        learningRate = newLearningRate;
-    }
-
-    if (frame.size() != frameSize_)
-        initialize(frame.size(), 0.0f, frame.depth() == CV_8U ? 255.0f : frame.depth() == CV_16U ? std::numeric_limits<ushort>::max() : 1.0f);
-
-    fgmask.create(frameSize_, CV_8UC1);
-    if (stream)
-        stream.enqueueMemSet(fgmask, cv::Scalar::all(0));
-    else
-        fgmask.setTo(cv::Scalar::all(0));
-
-    funcs[frame.depth()][frame.channels() - 1](frame, fgmask, colors_, weights_, nfeatures_, frameNum_, learningRate, updateBackgroundModel, cv::gpu::StreamAccessor::getStream(stream));
-
-    // medianBlur
-    if (smoothingRadius > 0)
-    {
-        boxFilter_->apply(fgmask, buf_, cv::Rect(0,0,-1,-1), stream);
-        int minCount = (smoothingRadius * smoothingRadius + 1) / 2;
-        double thresh = 255.0 * minCount / (smoothingRadius * smoothingRadius);
-        cv::gpu::threshold(buf_, fgmask, thresh, 255.0, cv::THRESH_BINARY, stream);
-    }
-
-    // keep track of how many frames we have processed
-    ++frameNum_;
-}
-
-void cv::gpu::GMG_GPU::release()
-{
-    frameSize_ = Size();
-
-    nfeatures_.release();
-    colors_.release();
-    weights_.release();
-    boxFilter_.release();
-    buf_.release();
-}
-
-#endif
--- a/modules/gpu/src/bgfg_mog.cpp
+++ b/modules/gpu/src/bgfg_mog.cpp
@@ -1,279 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
-
-// Stubs for builds without CUDA: every entry point calls throw_no_cuda(),
-// except release(), which does nothing.
-
-// ---- MOG_GPU ----
-cv::gpu::MOG_GPU::MOG_GPU(int)
-{
-    throw_no_cuda();
-}
-void cv::gpu::MOG_GPU::initialize(cv::Size, int)
-{
-    throw_no_cuda();
-}
-void cv::gpu::MOG_GPU::operator()(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, float, Stream&)
-{
-    throw_no_cuda();
-}
-void cv::gpu::MOG_GPU::getBackgroundImage(GpuMat&, Stream&) const
-{
-    throw_no_cuda();
-}
-void cv::gpu::MOG_GPU::release()
-{
-}
-
-// ---- MOG2_GPU ----
-cv::gpu::MOG2_GPU::MOG2_GPU(int)
-{
-    throw_no_cuda();
-}
-void cv::gpu::MOG2_GPU::initialize(cv::Size, int)
-{
-    throw_no_cuda();
-}
-void cv::gpu::MOG2_GPU::operator()(const GpuMat&, GpuMat&, float, Stream&)
-{
-    throw_no_cuda();
-}
-void cv::gpu::MOG2_GPU::getBackgroundImage(GpuMat&, Stream&) const
-{
-    throw_no_cuda();
-}
-void cv::gpu::MOG2_GPU::release()
-{
-}
-
-#else
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace mog
-    {
-        void mog_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var,
-                     int nmixtures, float varThreshold, float learningRate, float backgroundRatio, float noiseSigma,
-                     cudaStream_t stream);
-        void getBackgroundImage_gpu(int cn, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream);
-
-        void loadConstants(int nmixtures, float Tb, float TB, float Tg, float varInit, float varMin, float varMax, float tau, unsigned char shadowVal);
-        void mog2_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean, float alphaT, float prune, bool detectShadows, cudaStream_t stream);
-        void getBackgroundImage2_gpu(int cn, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream);
-    }
-}}}
-
-// Default parameter values for MOG_GPU.
-namespace mog
-{
-    // number of mixtures used when the constructor receives a non-positive count
-    const int defaultNMixtures = 5;
-
-    // initial value of MOG_GPU::history
-    const int defaultHistory = 200;
-
-    // initial values of MOG_GPU::backgroundRatio, varThreshold and noiseSigma
-    const float defaultBackgroundRatio = 0.7f;
-    const float defaultVarThreshold = 2.5f * 2.5f;
-    const float defaultNoiseSigma = 30.0f * 0.5f;
-
-    // NOTE(review): no use of this constant is visible near this namespace - confirm before relying on it
-    const float defaultInitialWeight = 0.05f;
-}
-
-// Builds an uninitialized model. A non-positive nmixtures selects
-// mog::defaultNMixtures; the mixture count is never allowed to exceed 8.
-// The tuning parameters start from the mog:: defaults.
-cv::gpu::MOG_GPU::MOG_GPU(int nmixtures) :
-    frameSize_(0, 0), frameType_(0), nframes_(0)
-{
-    int requested = mog::defaultNMixtures;
-    if (nmixtures > 0)
-        requested = nmixtures;
-    nmixtures_ = std::min(requested, 8);
-
-    history = mog::defaultHistory;
-    backgroundRatio = mog::defaultBackgroundRatio;
-    varThreshold = mog::defaultVarThreshold;
-    noiseSigma = mog::defaultNoiseSigma;
-}
-
-// (Re)creates the model buffers for frames of the given size and type, zeroes
-// them and restarts the frame counter.
-// Only CV_8UC1, CV_8UC3 and CV_8UC4 frame types pass the assertion.
-void cv::gpu::MOG_GPU::initialize(cv::Size frameSize, int frameType)
-{
-    CV_Assert(frameType == CV_8UC1 || frameType == CV_8UC3 || frameType == CV_8UC4);
-
-    frameSize_ = frameSize;
-    frameType_ = frameType;
-
-    const int channels = CV_MAT_CN(frameType);
-
-    // Every buffer has frameSize.height * nmixtures_ rows and frameSize.width columns.
-    const int modelRows = frameSize.height * nmixtures_;
-    const int modelCols = frameSize.width;
-
-    // Per gaussian mixture of each pixel the model keeps:
-    //   - the mixture weight (w),
-    //   - the mixture sort key (w/sum_of_variances),
-    //   - the mean (one value per channel),
-    //   - the diagonal covariance matrix (one value per channel).
-    weight_.create(modelRows, modelCols, CV_32FC1);
-    sortKey_.create(modelRows, modelCols, CV_32FC1);
-    mean_.create(modelRows, modelCols, CV_32FC(channels));
-    var_.create(modelRows, modelCols, CV_32FC(channels));
-
-    // start from an all-zero model
-    weight_.setTo(cv::Scalar::all(0));
-    sortKey_.setTo(cv::Scalar::all(0));
-    mean_.setTo(cv::Scalar::all(0));
-    var_.setTo(cv::Scalar::all(0));
-
-    nframes_ = 0;
-}
-
-// Feeds one 8-bit frame to the model and writes the foreground mask (CV_8UC1).
-// The model is (re)initialized on the first frame, when learningRate >= 1, or
-// when the frame size or channel count differs from the current model.
-// A negative learningRate (or the first frame after initialization) selects the
-// automatic rate 1 / min(nframes_, history).
-void cv::gpu::MOG_GPU::operator()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat& fgmask, float learningRate, Stream& stream)
-{
-    CV_Assert(frame.depth() == CV_8U);
-
-    const int channels = frame.channels();
-
-    const bool needsInit = nframes_ == 0
-                        || learningRate >= 1.0
-                        || frame.size() != frameSize_
-                        || channels != mean_.channels();
-    if (needsInit)
-        initialize(frame.size(), frame.type());
-
-    fgmask.create(frameSize_, CV_8UC1);
-
-    ++nframes_;
-
-    const bool keepCallerRate = learningRate >= 0.0f && nframes_ > 1;
-    if (!keepCallerRate)
-        learningRate = 1.0f / std::min(nframes_, history);
-    CV_Assert(learningRate >= 0.0f);
-
-    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-
-    cv::gpu::cudev::mog::mog_gpu(frame, channels, fgmask, weight_, sortKey_, mean_, var_, nmixtures_,
-                                 varThreshold, learningRate, backgroundRatio, noiseSigma,
-                                 cudaStream);
-}
-
-// Writes the background image derived from the current model into
-// backgroundImage, which is (re)allocated with the model's frame size and type.
-void cv::gpu::MOG_GPU::getBackgroundImage(GpuMat& backgroundImage, Stream& stream) const
-{
-    backgroundImage.create(frameSize_, frameType_);
-
-    const int channels = backgroundImage.channels();
-    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-
-    cv::gpu::cudev::mog::getBackgroundImage_gpu(channels, weight_, mean_, backgroundImage, nmixtures_, backgroundRatio, cudaStream);
-}
-
-// Releases the model buffers and returns the bookkeeping fields to their
-// constructor values; nframes_ == 0 makes the next operator() call re-initialize.
-void cv::gpu::MOG_GPU::release()
-{
-    // model buffers
-    weight_.release();
-    sortKey_.release();
-    mean_.release();
-    var_.release();
-
-    // bookkeeping
-    nframes_ = 0;
-    frameType_ = 0;
-    frameSize_ = Size(0, 0);
-}
-
-/////////////////////////////////////////////////////////////////
-// MOG2
-
-// Default parameter values for MOG2_GPU.
-namespace mog2
-{
-    // ---- default parameters of the gaussian background detection algorithm ----
-
-    // history length; learning rate alpha = 1/defaultHistory
-    const int defaultHistory = 500;
-
-    const float defaultVarThreshold = 4.0f * 4.0f;
-
-    // maximal number of Gaussians in mixture
-    const int defaultNMixtures = 5;
-
-    // threshold sum of weights for background test
-    const float defaultBackgroundRatio = 0.9f;
-
-    const float defaultVarThresholdGen = 3.0f * 3.0f;
-
-    // initial variance for new components, with the limits derived from / paired with it
-    const float defaultVarInit = 15.0f;
-    const float defaultVarMax = 5.0f * defaultVarInit;
-    const float defaultVarMin = 4.0f;
-
-    // ---- additional parameters ----
-
-    // complexity reduction prior constant; 0 - no reduction of number of components
-    const float defaultfCT = 0.05f;
-
-    // value to use in the segmentation mask for shadows, set 0 not to do shadow detection
-    const unsigned char defaultnShadowDetection = 127;
-
-    // Tau - shadow threshold, see the paper for explanation
-    const float defaultfTau = 0.5f;
-}
-
-// Builds an uninitialized model. A non-positive nmixtures selects
-// mog2::defaultNMixtures. Shadow detection is enabled and every tuning
-// parameter starts from its mog2:: default.
-cv::gpu::MOG2_GPU::MOG2_GPU(int nmixtures) :
-    frameSize_(0, 0), frameType_(0), nframes_(0)
-{
-    if (nmixtures > 0)
-        nmixtures_ = nmixtures;
-    else
-        nmixtures_ = mog2::defaultNMixtures;
-
-    // detection parameters
-    history = mog2::defaultHistory;
-    varThreshold = mog2::defaultVarThreshold;
-    varThresholdGen = mog2::defaultVarThresholdGen;
-    backgroundRatio = mog2::defaultBackgroundRatio;
-
-    // variance of new components and its limits
-    fVarInit = mog2::defaultVarInit;
-    fVarMin = mog2::defaultVarMin;
-    fVarMax = mog2::defaultVarMax;
-
-    // complexity reduction
-    fCT = mog2::defaultfCT;
-
-    // shadow handling
-    bShadowDetection = true;
-    nShadowDetection = mog2::defaultnShadowDetection;
-    fTau = mog2::defaultfTau;
-}
-
-// (Re)creates the model buffers for frames of the given size and type, restarts
-// the frame counter and passes the current parameters to loadConstants().
-// Only CV_8UC1, CV_8UC3 and CV_8UC4 frame types pass the assertion.
-void cv::gpu::MOG2_GPU::initialize(cv::Size frameSize, int frameType)
-{
-    CV_Assert(frameType == CV_8UC1 || frameType == CV_8UC3 || frameType == CV_8UC4);
-
-    frameSize_ = frameSize;
-    frameType_ = frameType;
-    nframes_ = 0;
-
-    const int channels = CV_MAT_CN(frameType);
-
-    // Every mixture buffer has frameSize.height * nmixtures_ rows and frameSize.width columns.
-    const int modelRows = frameSize.height * nmixtures_;
-    const int modelCols = frameSize.width;
-
-    // Per gaussian mixture of each pixel the model keeps:
-    //   - the mixture weight (w),
-    //   - the covariance,
-    //   - the mean (one value per channel).
-    weight_.create(modelRows, modelCols, CV_32FC1);
-    variance_.create(modelRows, modelCols, CV_32FC1);
-    mean_.create(modelRows, modelCols, CV_32FC(channels));
-
-    // number of used modes per pixel - all zeros at start
-    bgmodelUsedModes_.create(frameSize_, CV_8UC1);
-    bgmodelUsedModes_.setTo(cv::Scalar::all(0));
-
-    cv::gpu::cudev::mog::loadConstants(nmixtures_, varThreshold, backgroundRatio, varThresholdGen,
-                                       fVarInit, fVarMin, fVarMax, fTau, nShadowDetection);
-}
-
-// Feeds one 8-bit frame to the model and writes the foreground mask (CV_8UC1,
-// cleared to zero before the update).
-//
-// The model is (re)initialized on the first frame, when learningRate >= 1, or
-// when the frame size or channel count differs from the current model.
-// A negative learningRate (or the first frame after initialization) selects the
-// automatic rate 1 / min(2 * nframes_, history).
-void cv::gpu::MOG2_GPU::operator()(const GpuMat& frame, GpuMat& fgmask, float learningRate, Stream& stream)
-{
-    using namespace cv::gpu::cudev::mog;
-
-    // initialize() only accepts 8-bit frame types, but it is skipped when the
-    // size and channel count already match the model. Check the depth on every
-    // call (as MOG_GPU::operator() does) so a frame of another depth cannot
-    // reach the kernel.
-    CV_Assert(frame.depth() == CV_8U);
-
-    const int ch = frame.channels();
-
-    if (nframes_ == 0 || learningRate >= 1.0f || frame.size() != frameSize_ || ch != mean_.channels())
-        initialize(frame.size(), frame.type());
-
-    fgmask.create(frameSize_, CV_8UC1);
-    fgmask.setTo(cv::Scalar::all(0));
-
-    ++nframes_;
-    learningRate = learningRate >= 0.0f && nframes_ > 1 ? learningRate : 1.0f / std::min(2 * nframes_, history);
-    CV_Assert(learningRate >= 0.0f);
-
-    // alphaT is the learning rate; the prune argument is -learningRate * fCT
-    mog2_gpu(frame, ch, fgmask, bgmodelUsedModes_, weight_, variance_, mean_, learningRate, -learningRate * fCT, bShadowDetection, StreamAccessor::getStream(stream));
-}
-
-// Writes the background image derived from the current model into
-// backgroundImage, which is (re)allocated with the model's frame size and type.
-void cv::gpu::MOG2_GPU::getBackgroundImage(GpuMat& backgroundImage, Stream& stream) const
-{
-    backgroundImage.create(frameSize_, frameType_);
-
-    const int channels = backgroundImage.channels();
-    cudaStream_t cudaStream = StreamAccessor::getStream(stream);
-
-    cv::gpu::cudev::mog::getBackgroundImage2_gpu(channels, bgmodelUsedModes_, weight_, mean_, backgroundImage, cudaStream);
-}
-
-// Releases the model buffers and returns the bookkeeping fields to their
-// constructor values; nframes_ == 0 makes the next operator() call re-initialize.
-void cv::gpu::MOG2_GPU::release()
-{
-    // model buffers
-    weight_.release();
-    variance_.release();
-    mean_.release();
-    bgmodelUsedModes_.release();
-
-    // bookkeeping
-    nframes_ = 0;
-    frameType_ = 0;
-    frameSize_ = Size(0, 0);
-}
-
-#endif
--- a/modules/gpu/src/bilateral_filter.cpp
+++ b/modules/gpu/src/bilateral_filter.cpp
@@ -1,157 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-// Stubs for builds without CUDA: every entry point calls throw_no_cuda().
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int)
-{
-    throw_no_cuda();
-}
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int, int, int, float, float, float)
-{
-    throw_no_cuda();
-}
-
-void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&, GpuMat&, Stream&)
-{
-    throw_no_cuda();
-}
-
-#else /* !defined (HAVE_CUDA) */
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace disp_bilateral_filter
-    {
-        void disp_load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc);
-
-        template<typename T>
-        void disp_bilateral_filter(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
-    }
-}}}
-
-using namespace ::cv::gpu::cudev::disp_bilateral_filter;
-
-namespace
-{
-    // Defaults applied by the three-argument DisparityBilateralFilter constructor.
-    const float DEFAULT_EDGE_THRESHOLD     = 0.1f;   // -> edge_threshold
-    const float DEFAULT_MAX_DISC_THRESHOLD = 0.2f;   // -> max_disc_threshold
-    const float DEFAULT_SIGMA_RANGE        = 10.0f;  // -> sigma_range
-
-    // Fills table_color with a 1 x len row of weights
-    //     w[d] = exp(-d^2 / (2 * sigma_range^2)),  d = 0 .. len-1,
-    // computed on the host and then uploaded.
-    inline void calc_color_weighted_table(GpuMat& table_color, float sigma_range, int len)
-    {
-        Mat host_table(1, len, CV_32F);
-        float* weights = host_table.ptr<float>();
-
-        for (int d = 0; d < len; ++d)
-        {
-            const double exponent = -double(d * d) / (2 * sigma_range * sigma_range);
-            weights[d] = static_cast<float>(std::exp(exponent));
-        }
-
-        table_color.upload(host_table);
-    }
-
-    // Fills table_space with a (half+1) x (half+1) table, half = win_size / 2, of
-    //     w[y][x] = exp(-sqrt(y^2 + x^2) / dist_space),
-    // computed on the host and then uploaded.
-    inline void calc_space_weighted_filter(GpuMat& table_space, int win_size, float dist_space)
-    {
-        const int half = (win_size >> 1);
-        const int side = half + 1;
-
-        Mat host_table(side, side, CV_32F);
-
-        for (int dy = 0; dy < side; ++dy)
-        {
-            float* weights = host_table.ptr<float>(dy);
-
-            for (int dx = 0; dx < side; ++dx)
-                weights[dx] = exp(-sqrt(float(dy * dy) + float(dx * dx)) / dist_space);
-        }
-
-        table_space.upload(host_table);
-    }
-
-    // Runs the disparity bilateral filter for disparity element type T.
-    //
-    // Steps: derive the discontinuity limits from ndisp and the two thresholds,
-    // pass them with the weight tables to disp_load_constants(), copy disp into
-    // dst unless both refer to the same GpuMat object, then filter dst in place.
-    template <typename T>
-    void disp_bilateral_filter_operator(int ndisp, int radius, int iters, float edge_threshold,float max_disc_threshold,
-                                   GpuMat& table_color, GpuMat& table_space,
-                                   const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
-    {
-        // rounded thresholds expressed in disparity units; the edge limit is at least 1
-        const short rounded_edge = short(ndisp * edge_threshold + 0.5);
-        short edge_disc = std::max<short>(short(1), rounded_edge);
-        short max_disc = short(ndisp * max_disc_threshold + 0.5);
-
-        disp_load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);
-
-        const bool same_object = (&dst == &disp);
-        if (!same_object)
-        {
-            if (stream)
-                stream.enqueueCopy(disp, dst);
-            else
-                disp.copyTo(dst);
-        }
-
-        disp_bilateral_filter<T>(dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
-    }
-
-    // Signature shared by all disp_bilateral_filter_operator<T> instantiations.
-    typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
-                                                GpuMat& table_color, GpuMat& table_space,
-                                                const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream);
-
-    // Dispatch table indexed by the disparity type code; only the unsigned char
-    // (slot 0) and short (slot 3) slots are populated.
-    const bilateral_filter_operator_t operators[] =
-    {
-        disp_bilateral_filter_operator<unsigned char>,
-        0,
-        0,
-        disp_bilateral_filter_operator<short>,
-        0,
-        0,
-        0,
-        0
-    };
-}
-
-// Creates a filter with the default edge threshold, maximum-discontinuity
-// threshold and sigma range, and precomputes both weight tables.
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radius_, int iters_)
-    : ndisp(ndisp_),
-      radius(radius_),
-      iters(iters_),
-      edge_threshold(DEFAULT_EDGE_THRESHOLD),
-      max_disc_threshold(DEFAULT_MAX_DISC_THRESHOLD),
-      sigma_range(DEFAULT_SIGMA_RANGE)
-{
-    // NOTE(review): the color table has 255 entries - confirm whether the kernel ever indexes entry 255.
-    const int color_table_len = 255;
-    calc_color_weighted_table(table_color, sigma_range, color_table_len);
-
-    // window of 2 * radius + 1 pixels, distance scale radius + 1
-    const int win_size = radius * 2 + 1;
-    calc_space_weighted_filter(table_space, win_size, radius + 1.0f);
-}
-
-// Creates a filter with caller-supplied thresholds and sigma range, and
-// precomputes both weight tables.
-cv::gpu::DisparityBilateralFilter::DisparityBilateralFilter(int ndisp_, int radius_, int iters_, float edge_threshold_,
-                                                     float max_disc_threshold_, float sigma_range_)
-    : ndisp(ndisp_),
-      radius(radius_),
-      iters(iters_),
-      edge_threshold(edge_threshold_),
-      max_disc_threshold(max_disc_threshold_),
-      sigma_range(sigma_range_)
-{
-    // NOTE(review): the color table has 255 entries - confirm whether the kernel ever indexes entry 255.
-    const int color_table_len = 255;
-    calc_color_weighted_table(table_color, sigma_range, color_table_len);
-
-    // window of 2 * radius + 1 pixels, distance scale radius + 1
-    const int win_size = radius * 2 + 1;
-    calc_space_weighted_filter(table_space, win_size, radius + 1.0f);
-}
-
-// Filters disp guided by img and stores the result in dst.
-// disp must be CV_8U or CV_16S and have the same dimensions as img; img must be
-// CV_8UC1 or CV_8UC3. The implementation is picked from the operators table by
-// the disparity type.
-void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat& disp, const GpuMat& img, GpuMat& dst, Stream& stream)
-{
-    CV_DbgAssert(0 < ndisp && 0 < radius && 0 < iters);
-    CV_Assert(disp.rows == img.rows && disp.cols == img.cols && (disp.type() == CV_8U || disp.type() == CV_16S) && (img.type() == CV_8UC1 || img.type() == CV_8UC3));
-
-    const bilateral_filter_operator_t impl = operators[disp.type()];
-    impl(ndisp, radius, iters, edge_threshold, max_disc_threshold, table_color, table_space, disp, img, dst, stream);
-}
-
-#endif /* !defined (HAVE_CUDA) */
--- a/modules/gpu/src/blend.cpp
+++ b/modules/gpu/src/blend.cpp
@@ -1,99 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-// Stub for builds without CUDA: calls throw_no_cuda().
-void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, GpuMat&, Stream&)
-{
-    throw_no_cuda();
-}
-
-#else
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace blend
-    {
-        template <typename T>
-        void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
-
-        void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
-    }
-}}}
-
-using namespace ::cv::gpu::cudev::blend;
-
-// Linear blending of img1 and img2 with per-pixel weights.
-//
-// img1 and img2 must agree in size and type; weights1 / weights2 must be CV_32F
-// and match the size of img1 / img2. result is (re)allocated with the size and
-// type of img1. Supported depths are CV_8U and CV_32F; 8-bit 4-channel images
-// use the dedicated blendLinearCaller8UC4 path. Any other depth raises
-// StsUnsupportedFormat.
-void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
-                          GpuMat& result, Stream& stream)
-{
-    CV_Assert(img1.size() == img2.size());
-    CV_Assert(img1.type() == img2.type());
-    CV_Assert(weights1.size() == img1.size());
-    CV_Assert(weights2.size() == img2.size());
-    CV_Assert(weights1.type() == CV_32F);
-    CV_Assert(weights2.type() == CV_32F);
-
-    const Size size = img1.size();
-    const int depth = img1.depth();
-    const int cn = img1.channels();
-
-    result.create(size, CV_MAKE_TYPE(depth, cn));
-
-    if (depth == CV_8U)
-    {
-        if (cn == 4)
-            blendLinearCaller8UC4(size.height, size.width, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
-        else
-            blendLinearCaller<uchar>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
-    }
-    else if (depth == CV_32F)
-    {
-        blendLinearCaller<float>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
-    }
-    else
-    {
-        CV_Error(cv::Error::StsUnsupportedFormat, "bad image depth in linear blending function");
-    }
-}
-
-#endif
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
--- a/modules/gpu/src/calib3d.cpp
+++ b/modules/gpu/src/calib3d.cpp
@@ -252,7 +252,7 @@ void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& cam
    // Find the best hypothesis index
    Point best_idx;
    double best_score;
-    minMaxLoc(d_hypothesis_scores, NULL, &best_score, NULL, &best_idx);
+    gpu::minMaxLoc(d_hypothesis_scores, NULL, &best_score, NULL, &best_idx);
    int num_inliers = static_cast<int>(best_score);

    // Extract the best hypothesis data
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -41,8 +41,6 @@
 //M*/

 #include "precomp.hpp"
-#include <vector>
-#include <iostream>
 #include "opencv2/objdetect/objdetect_c.h"

 using namespace cv;
@@ -75,6 +73,37 @@ public:
    virtual bool read(const String& classifierAsXml) = 0;
 };

+#ifndef HAVE_OPENCV_GPULEGACY
+
+// Placeholder HaarCascade compiled when HAVE_OPENCV_GPULEGACY is not defined:
+// the constructor and every member call throw_no_cuda().
+struct cv::gpu::CascadeClassifier_GPU::HaarCascade : cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
+{
+    HaarCascade()
+    {
+        throw_no_cuda();
+    }
+
+    // The return statements after throw_no_cuda() only give each function a
+    // value on every path; presumably they are never reached.
+
+    bool read(const String&)
+    {
+        throw_no_cuda();
+        return false;
+    }
+
+    cv::Size getClassifierCvSize() const
+    {
+        throw_no_cuda();
+        return cv::Size();
+    }
+
+    unsigned int process(const GpuMat&, GpuMat&, float, int, bool, bool, cv::Size, cv::Size)
+    {
+        throw_no_cuda();
+        return 0;
+    }
+};
+
+#else
+
 struct cv::gpu::CascadeClassifier_GPU::HaarCascade : cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl
 {
 public:
@@ -284,6 +313,8 @@ private:
    virtual ~HaarCascade(){}
 };

+#endif
+
 cv::Size operator -(const cv::Size& a, const cv::Size& b)
 {
    return cv::Size(a.width - b.width, a.height - b.height);
@@ -477,6 +508,8 @@ private:
            resuzeBuffer.create(frame, CV_8UC1);

            integral.create(frame.height + 1, integralFactor * (frame.width + 1), CV_32SC1);
+
+#ifdef HAVE_OPENCV_GPULEGACY
            NcvSize32u roiSize;
            roiSize.width = frame.width;
            roiSize.height = frame.height;
@@ -487,6 +520,7 @@ private:
            Ncv32u bufSize;
            ncvSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) );
            integralBuffer.create(1, bufSize, CV_8UC1);
+#endif

            candidates.create(1 , frame.width >> 1, CV_32SC4);
        }
@@ -722,240 +756,3 @@ bool cv::gpu::CascadeClassifier_GPU::load(const String& filename)
 }

 #endif
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////
-
-#if defined (HAVE_CUDA)
-
-// Functor converting between NcvRect32u and cv::Rect, field by field.
-struct RectConvert
-{
-    // cv::Rect -> NcvRect32u
-    NcvRect32u operator()(const Rect& src) const
-    {
-        NcvRect32u dst;
-        dst.x = src.x;
-        dst.y = src.y;
-        dst.width = src.width;
-        dst.height = src.height;
-        return dst;
-    }
-
-    // NcvRect32u -> cv::Rect
-    Rect operator()(const NcvRect32u& src) const
-    {
-        return Rect(src.x, src.y, src.width, src.height);
-    }
-};
-
-// Groups similar rectangles in place using cv::groupRectangles.
-//
-// hypotheses     - in: candidate rectangles; out: the grouped rectangles.
-// groupThreshold - forwarded to cv::groupRectangles.
-// eps            - forwarded to cv::groupRectangles.
-// weights        - optional; when non-null it receives the per-group weights
-//                  produced by cv::groupRectangles, one entry per rectangle
-//                  left in hypotheses.
-void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights)
-{
-    std::vector<Rect> rects(hypotheses.size());
-    std::transform(hypotheses.begin(), hypotheses.end(), rects.begin(), RectConvert());
-
-    if (weights)
-    {
-        std::vector<int> weights_int;
-        weights_int.assign(weights->begin(), weights->end());
-        cv::groupRectangles(rects, weights_int, groupThreshold, eps);
-
-        // Hand the grouped weights back to the caller; previously they were
-        // dropped and *weights kept its stale, pre-grouping contents.
-        weights->assign(weights_int.begin(), weights_int.end());
-    }
-    else
-    {
-        cv::groupRectangles(rects, groupThreshold, eps);
-    }
-
-    // Size the output first so the copy below can never write past its end.
-    hypotheses.resize(rects.size());
-    std::transform(rects.begin(), rects.end(), hypotheses.begin(), RectConvert());
-}
-
-// Loads a Haar cascade with cvLoad() and converts it to the NCV representation.
-//
-// filename            - cascade file passed to cvLoad().
-// haar                - out: cascade descriptor (counts, classifier window size, flags).
-// haarStages          - out: one entry per stage.
-// haarClassifierNodes - out: the root node of every tree first, followed by all
-//                       non-root nodes.
-// haarFeatures        - out: the rectangles of every node, in processing order.
-//
-// Returns NCV_HAAR_XML_LOADING_EXCEPTION when cvLoad() yields nothing and
-// NCV_SUCCESS at the end; intermediate NCV failures are handed to
-// ncvAssertReturn (presumably returning that status - confirm against the macro).
-NCVStatus loadFromXML(const String &filename,
-                      HaarClassifierCascadeDescriptor &haar,
-                      std::vector<HaarStage64> &haarStages,
-                      std::vector<HaarClassifierNode128> &haarClassifierNodes,
-                      std::vector<HaarFeature64> &haarFeatures)
-{
-    NCVStatus ncvStat;
-
-    haar.NumStages = 0;
-    haar.NumClassifierRootNodes = 0;
-    haar.NumClassifierTotalNodes = 0;
-    haar.NumFeatures = 0;
-    haar.ClassifierSize.width = 0;
-    haar.ClassifierSize.height = 0;
-    haar.bHasStumpsOnly = true;
-    haar.bNeedsTiltedII = false;
-
-    // non-root nodes are collected here and appended after all root nodes at the end
-    std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;
-    haarStages.resize(0);
-    haarClassifierNodes.resize(0);
-    haarFeatures.resize(0);
-
-    Ptr<CvHaarClassifierCascade> oldCascade = (CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0);
-    if (oldCascade.empty())
-    {
-        return NCV_HAAR_XML_LOADING_EXCEPTION;
-    }
-
-    haar.ClassifierSize.width = oldCascade->orig_window_size.width;
-    haar.ClassifierSize.height = oldCascade->orig_window_size.height;
-
-    int stagesCount = oldCascade->count;
-    for(int s = 0; s < stagesCount; ++s) // by stages
-    {
-        HaarStage64 curStage;
-        curStage.setStartClassifierRootNodeOffset(static_cast<Ncv32u>(haarClassifierNodes.size()));
-
-        curStage.setStageThreshold(oldCascade->stage_classifier[s].threshold);
-
-        int treesCount = oldCascade->stage_classifier[s].count;
-        for(int t = 0; t < treesCount; ++t) // by trees
-        {
-            Ncv32u nodeId = 0;
-            CvHaarClassifier* tree = &oldCascade->stage_classifier[s].classifier[t];
-
-            int nodesCount = tree->count;
-            for(int n = 0; n < nodesCount; ++n)  //by features
-            {
-                CvHaarFeature* feature = &tree->haar_feature[n];
-
-                HaarClassifierNode128 curNode;
-                curNode.setThreshold(tree->threshold[n]);
-
-                NcvBool bIsLeftNodeLeaf = false;
-                NcvBool bIsRightNodeLeaf = false;
-
-                // A child index <= 0 denotes a leaf whose value is alpha[-index];
-                // a positive index refers to another node of the same tree.
-                HaarClassifierNodeDescriptor32 nodeLeft;
-                if ( tree->left[n] <= 0 )
-                {
-                    Ncv32f leftVal = tree->alpha[-tree->left[n]];
-                    ncvStat = nodeLeft.create(leftVal);
-                    ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
-                    bIsLeftNodeLeaf = true;
-                }
-                else
-                {
-                    Ncv32u leftNodeOffset = tree->left[n];
-                    nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1));
-                    haar.bHasStumpsOnly = false;
-                }
-                curNode.setLeftNodeDesc(nodeLeft);
-
-                HaarClassifierNodeDescriptor32 nodeRight;
-                if ( tree->right[n] <= 0 )
-                {
-                    Ncv32f rightVal = tree->alpha[-tree->right[n]];
-                    ncvStat = nodeRight.create(rightVal);
-                    ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
-                    bIsRightNodeLeaf = true;
-                }
-                else
-                {
-                    Ncv32u rightNodeOffset = tree->right[n];
-                    nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1));
-                    haar.bHasStumpsOnly = false;
-                }
-                curNode.setRightNodeDesc(nodeRight);
-
-                // NOTE(review): this overwrites the cascade-level flag for every node, so
-                // after the loops it reflects only the last node processed - confirm
-                // whether it should accumulate across nodes instead.
-                Ncv32u tiltedVal = feature->tilted;
-                haar.bNeedsTiltedII = (tiltedVal != 0);
-
-                // Copy the rectangles of this node; the first zero weight ends the list.
-                Ncv32u featureId = 0;
-                for(int l = 0; l < CV_HAAR_FEATURE_MAX; ++l) //by rects
-                {
-                    Ncv32u rectX = feature->rect[l].r.x;
-                    Ncv32u rectY = feature->rect[l].r.y;
-                    Ncv32u rectWidth = feature->rect[l].r.width;
-                    Ncv32u rectHeight = feature->rect[l].r.height;
-
-                    Ncv32f rectWeight = feature->rect[l].weight;
-
-                    if (rectWeight == 0/* && rectX == 0 &&rectY == 0 && rectWidth == 0 && rectHeight == 0*/)
-                        break;
-
-                    HaarFeature64 curFeature;
-                    ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height);
-                    curFeature.setWeight(rectWeight);
-                    ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
-                    haarFeatures.push_back(curFeature);
-
-                    featureId++;
-                }
-
-                // featureId rectangles were appended; the last argument is the index of the first one
-                HaarFeatureDescriptor32 tmpFeatureDesc;
-                ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, bIsLeftNodeLeaf, bIsRightNodeLeaf,
-                    featureId, static_cast<Ncv32u>(haarFeatures.size()) - featureId);
-                ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
-                curNode.setFeatureDesc(tmpFeatureDesc);
-
-                if (!nodeId)
-                {
-                    //root node
-                    haarClassifierNodes.push_back(curNode);
-                }
-                else
-                {
-                    //other node
-                    h_TmpClassifierNotRootNodes.push_back(curNode);
-                }
-
-                nodeId++;
-            }
-        }
-
-        curStage.setNumClassifierRootNodes(treesCount);
-        haarStages.push_back(curStage);
-    }
-
-    //fill in cascade stats
-    haar.NumStages = static_cast<Ncv32u>(haarStages.size());
-    haar.NumClassifierRootNodes = static_cast<Ncv32u>(haarClassifierNodes.size());
-    haar.NumClassifierTotalNodes = static_cast<Ncv32u>(haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size());
-    haar.NumFeatures = static_cast<Ncv32u>(haarFeatures.size());
-
-    //merge root and leaf nodes in one classifiers array
-    // Non-root nodes end up after the root nodes, so every non-leaf child offset
-    // is shifted by the number of root nodes.
-    Ncv32u offsetRoot = static_cast<Ncv32u>(haarClassifierNodes.size());
-    for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)
-    {
-        HaarFeatureDescriptor32 featureDesc = haarClassifierNodes[i].getFeatureDesc();
-
-        HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc();
-        if (!featureDesc.isLeftNodeLeaf())
-        {
-            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
-            nodeLeft.create(newOffset);
-        }
-        haarClassifierNodes[i].setLeftNodeDesc(nodeLeft);
-
-        HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc();
-        if (!featureDesc.isRightNodeLeaf())
-        {
-            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
-            nodeRight.create(newOffset);
-        }
-        haarClassifierNodes[i].setRightNodeDesc(nodeRight);
-    }
-
-    // Same offset fix-up for the non-root nodes, which are then appended to the output array.
-    for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++)
-    {
-        HaarFeatureDescriptor32 featureDesc = h_TmpClassifierNotRootNodes[i].getFeatureDesc();
-
-        HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc();
-        if (!featureDesc.isLeftNodeLeaf())
-        {
-            Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
-            nodeLeft.create(newOffset);
-        }
-        h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft);
-
-        HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc();
-        if (!featureDesc.isRightNodeLeaf())
-        {
-            Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
-            nodeRight.create(newOffset);
-        }
-        h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight);
-
-        haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]);
-    }
-
-    return NCV_SUCCESS;
-}
-
-#endif /* HAVE_CUDA */
--- a/modules/gpu/src/color.cpp
+++ b/modules/gpu/src/color.cpp
--- a/modules/gpu/src/cuda/NV12ToARGB.cu
+++ b/modules/gpu/src/cuda/NV12ToARGB.cu
@@ -1,201 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-/*
- * NV12ToARGB color space conversion CUDA kernel
- *
- * This sample uses CUDA to perform a simple NV12 (YUV 4:2:0 planar)
- * source and converts to output in ARGB format
- */
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-
-namespace cv { namespace gpu { namespace cudev {
-    namespace video_decoding
-    {
-        __constant__ uint constAlpha = ((uint)0xff << 24);
-
-        __constant__ float constHueColorSpaceMat[9];
-
-        void loadHueCSC(float hueCSC[9])
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(constHueColorSpaceMat, hueCSC, 9 * sizeof(float)) );
-        }
-
-        __device__ void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue)
-        {
-            float luma, chromaCb, chromaCr;
-
-            // Prepare for hue adjustment
-            luma     = (float)yuvi[0];
-            chromaCb = (float)((int)yuvi[1] - 512.0f);
-            chromaCr = (float)((int)yuvi[2] - 512.0f);
-
-           // Convert YUV To RGB with hue adjustment
-           *red   = (luma     * constHueColorSpaceMat[0]) +
-                    (chromaCb * constHueColorSpaceMat[1]) +
-                    (chromaCr * constHueColorSpaceMat[2]);
-
-           *green = (luma     * constHueColorSpaceMat[3]) +
-                    (chromaCb * constHueColorSpaceMat[4]) +
-                    (chromaCr * constHueColorSpaceMat[5]);
-
-           *blue  = (luma     * constHueColorSpaceMat[6]) +
-                    (chromaCb * constHueColorSpaceMat[7]) +
-                    (chromaCr * constHueColorSpaceMat[8]);
-        }
-
-        __device__ uint RGBAPACK_10bit(float red, float green, float blue, uint alpha)
-        {
-            uint ARGBpixel = 0;
-
-            // Clamp final 10 bit results
-            red   = ::fmin(::fmax(red,   0.0f), 1023.f);
-            green = ::fmin(::fmax(green, 0.0f), 1023.f);
-            blue  = ::fmin(::fmax(blue,  0.0f), 1023.f);
-
-            // Convert to 8 bit unsigned integers per color component
-            ARGBpixel = (((uint)blue  >> 2) |
-                        (((uint)green >> 2) << 8)  |
-                        (((uint)red   >> 2) << 16) |
-                        (uint)alpha);
-
-            return ARGBpixel;
-        }
-
-        // CUDA kernel for outputing the final ARGB output from NV12
-
-        #define COLOR_COMPONENT_BIT_SIZE 10
-        #define COLOR_COMPONENT_MASK     0x3FF
-
-        __global__ void NV12ToARGB(uchar* srcImage, size_t nSourcePitch,
-                                   uint* dstImage, size_t nDestPitch,
-                                   uint width, uint height)
-        {
-            // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
-            const int x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
-            const int y = blockIdx.y *  blockDim.y       +  threadIdx.y;
-
-            if (x >= width || y >= height)
-                return;
-
-            // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
-            // if we move to texture we could read 4 luminance values
-
-            uint yuv101010Pel[2];
-
-            yuv101010Pel[0] = (srcImage[y * nSourcePitch + x    ]) << 2;
-            yuv101010Pel[1] = (srcImage[y * nSourcePitch + x + 1]) << 2;
-
-            const size_t chromaOffset = nSourcePitch * height;
-
-            const int y_chroma = y >> 1;
-
-            if (y & 1)  // odd scanline ?
-            {
-                uint chromaCb = srcImage[chromaOffset + y_chroma * nSourcePitch + x    ];
-                uint chromaCr = srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1];
-
-                if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
-                {
-                    chromaCb = (chromaCb + srcImage[chromaOffset + (y_chroma + 1) * nSourcePitch + x    ] + 1) >> 1;
-                    chromaCr = (chromaCr + srcImage[chromaOffset + (y_chroma + 1) * nSourcePitch + x + 1] + 1) >> 1;
-                }
-
-                yuv101010Pel[0] |= (chromaCb << ( COLOR_COMPONENT_BIT_SIZE       + 2));
-                yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
-
-                yuv101010Pel[1] |= (chromaCb << ( COLOR_COMPONENT_BIT_SIZE       + 2));
-                yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
-            }
-            else
-            {
-                yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x    ] << ( COLOR_COMPONENT_BIT_SIZE       + 2));
-                yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
-
-                yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x    ] << ( COLOR_COMPONENT_BIT_SIZE       + 2));
-                yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
-            }
-
-            // this steps performs the color conversion
-            uint yuvi[6];
-            float red[2], green[2], blue[2];
-
-            yuvi[0] =  (yuv101010Pel[0] &   COLOR_COMPONENT_MASK    );
-            yuvi[1] = ((yuv101010Pel[0] >>  COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
-            yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
-
-            yuvi[3] =  (yuv101010Pel[1] &   COLOR_COMPONENT_MASK    );
-            yuvi[4] = ((yuv101010Pel[1] >>  COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
-            yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);
-
-            // YUV to RGB Transformation conversion
-            YUV2RGB(&yuvi[0], &red[0], &green[0], &blue[0]);
-            YUV2RGB(&yuvi[3], &red[1], &green[1], &blue[1]);
-
-            // Clamp the results to RGBA
-
-            const size_t dstImagePitch = nDestPitch >> 2;
-
-            dstImage[y * dstImagePitch + x     ] = RGBAPACK_10bit(red[0], green[0], blue[0], constAlpha);
-            dstImage[y * dstImagePitch + x + 1 ] = RGBAPACK_10bit(red[1], green[1], blue[1], constAlpha);
-        }
-
-        void NV12ToARGB_gpu(const PtrStepb decodedFrame, PtrStepSz<uint> interopFrame, cudaStream_t stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(interopFrame.cols, 2 * block.x), divUp(interopFrame.rows, block.y));
-
-            NV12ToARGB<<<grid, block, 0, stream>>>(decodedFrame.data, decodedFrame.step, interopFrame.data, interopFrame.step,
-                interopFrame.cols, interopFrame.rows);
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    }
-}}}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
--- a/modules/gpu/src/cuda/bf_match.cu
+++ b/modules/gpu/src/cuda/bf_match.cu
@@ -1,774 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/utility.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-#include "opencv2/core/cuda/vec_distance.hpp"
-#include "opencv2/core/cuda/datamov_utils.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace bf_match
-    {
-        ///////////////////////////////////////////////////////////////////////////////
-        // Reduction
-
-        template <int BLOCK_SIZE>
-        __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)
-        {
-            s_distance += threadIdx.y * BLOCK_SIZE;
-            s_trainIdx += threadIdx.y * BLOCK_SIZE;
-
-            reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
-        }
-
-        template <int BLOCK_SIZE>
-        __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)
-        {
-            s_distance += threadIdx.y * BLOCK_SIZE;
-            s_trainIdx += threadIdx.y * BLOCK_SIZE;
-            s_imgIdx   += threadIdx.y * BLOCK_SIZE;
-
-            reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
-        }
-
-        ///////////////////////////////////////////////////////////////////////////////
-        // Match Unrolled Cached
-
-        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
-        __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
-        {
-            #pragma unroll
-            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
-            {
-                const int loadX = threadIdx.x + i * BLOCK_SIZE;
-                s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
-            }
-        }
-
-        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query,volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
-                                           typename Dist::value_type* s_query, typename Dist::value_type* s_train,
-                                           float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
-        {
-            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
-            {
-                Dist dist;
-
-                #pragma unroll
-                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
-                {
-                    const int loadX = threadIdx.x + i * BLOCK_SIZE;
-
-                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
-
-                    if (loadX < train.cols)
-                    {
-                        T val;
-
-                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
-                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
-                    }
-
-                    __syncthreads();
-
-                    #pragma unroll
-                    for (int j = 0; j < BLOCK_SIZE; ++j)
-                        dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
-
-                    __syncthreads();
-                }
-
-                typename Dist::result_type distVal = dist;
-
-                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
-
-                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
-                {
-                    bestImgIdx = imgIdx;
-                    bestDistance = distVal;
-                    bestTrainIdx = trainIdx;
-                }
-            }
-        }
-
-        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
-        {
-            extern __shared__ int smem[];
-
-            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
-
-            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
-            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
-
-            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
-
-            float myBestDistance = numeric_limits<float>::max();
-            int myBestTrainIdx = -1;
-
-            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);
-
-            __syncthreads();
-
-            float* s_distance = (float*)(smem);
-            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
-
-            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);
-
-            if (queryIdx < query.rows && threadIdx.x == 0)
-            {
-                bestTrainIdx[queryIdx] = myBestTrainIdx;
-                bestDistance[queryIdx] = myBestDistance;
-            }
-        }
-
-        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
-                                 const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
-                                 cudaStream_t stream)
-        {
-            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
-            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
-
-            const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-            matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
-                                            int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
-        {
-            extern __shared__ int smem[];
-
-            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
-
-            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
-            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
-
-            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
-
-            float myBestDistance = numeric_limits<float>::max();
-            int myBestTrainIdx = -1;
-            int myBestImgIdx = -1;
-
-            Mask m = mask;
-
-            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
-            {
-                const PtrStepSz<T> train = trains[imgIdx];
-                m.next();
-                loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
-            }
-
-            __syncthreads();
-
-            float* s_distance = (float*)(smem);
-            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
-            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
-
-            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);
-
-            if (queryIdx < query.rows && threadIdx.x == 0)
-            {
-                bestTrainIdx[queryIdx] = myBestTrainIdx;
-                bestImgIdx[queryIdx] = myBestImgIdx;
-                bestDistance[queryIdx] = myBestDistance;
-            }
-        }
-
-        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
-                                 const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
-                                 cudaStream_t stream)
-        {
-            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
-            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
-
-            const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-            matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        ///////////////////////////////////////////////////////////////////////////////
-        // Match Unrolled
-
-        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query,volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
-                                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
-                                     float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
-        {
-            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
-            {
-                Dist dist;
-
-                #pragma unroll
-                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
-                {
-                    const int loadX = threadIdx.x + i * BLOCK_SIZE;
-
-                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
-                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
-
-                    if (loadX < query.cols)
-                    {
-                        T val;
-
-                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
-                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
-
-                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
-                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
-                    }
-
-                    __syncthreads();
-
-                    #pragma unroll
-                    for (int j = 0; j < BLOCK_SIZE; ++j)
-                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
-
-                    __syncthreads();
-                }
-
-                typename Dist::result_type distVal = dist;
-
-                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
-
-                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
-                {
-                    bestImgIdx = imgIdx;
-                    bestDistance = distVal;
-                    bestTrainIdx = trainIdx;
-                }
-            }
-        }
-
-        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
-        {
-            extern __shared__ int smem[];
-
-            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
-
-            float myBestDistance = numeric_limits<float>::max();
-            int myBestTrainIdx = -1;
-
-            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
-            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
-
-            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);
-
-            __syncthreads();
-
-            float* s_distance = (float*)(smem);
-            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
-
-            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);
-
-            if (queryIdx < query.rows && threadIdx.x == 0)
-            {
-                bestTrainIdx[queryIdx] = myBestTrainIdx;
-                bestDistance[queryIdx] = myBestDistance;
-            }
-        }
-
-        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
-                           const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
-                           cudaStream_t stream)
-        {
-            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
-            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
-
-            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
-                                      int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
-        {
-            extern __shared__ int smem[];
-
-            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
-
-            float myBestDistance = numeric_limits<float>::max();
-            int myBestTrainIdx = -1;
-            int myBestImgIdx = -1;
-
-            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
-            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
-
-            Mask m = mask;
-
-            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
-            {
-                const PtrStepSz<T> train = trains[imgIdx];
-                m.next();
-                loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
-            }
-
-            __syncthreads();
-
-            float* s_distance = (float*)(smem);
-            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
-            int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
-
-            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);
-
-            if (queryIdx < query.rows && threadIdx.x == 0)
-            {
-                bestTrainIdx[queryIdx] = myBestTrainIdx;
-                bestImgIdx[queryIdx] = myBestImgIdx;
-                bestDistance[queryIdx] = myBestDistance;
-            }
-        }
-
-        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
-                           const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
-                           cudaStream_t stream)
-        {
-            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
-            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
-
-            const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        ///////////////////////////////////////////////////////////////////////////////
-        // Match
-
-        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        __device__ void loop(int queryIdx, const PtrStepSz<T>& query, volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
-                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
-                             float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
-        {
-            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
-            {
-                Dist dist;
-
-                for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
-                {
-                    const int loadX = threadIdx.x + i * BLOCK_SIZE;
-
-                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
-                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
-
-                    if (loadX < query.cols)
-                    {
-                        T val;
-
-                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
-                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
-
-                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
-                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
-                    }
-
-                    __syncthreads();
-
-                    #pragma unroll
-                    for (int j = 0; j < BLOCK_SIZE; ++j)
-                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
-
-                    __syncthreads();
-                }
-
-                typename Dist::result_type distVal = dist;
-
-                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
-
-                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
-                {
-                    bestImgIdx = imgIdx;
-                    bestDistance = distVal;
-                    bestTrainIdx = trainIdx;
-                }
-            }
-        }
-
-        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
-        {
-            extern __shared__ int smem[];
-
-            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
-
-            float myBestDistance = numeric_limits<float>::max();
-            int myBestTrainIdx = -1;
-
-            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
-            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
-
-            loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);
-
-            __syncthreads();
-
-            float* s_distance = (float*)(smem);
-            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
-
-            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);
-
-            if (queryIdx < query.rows && threadIdx.x == 0)
-            {
-                bestTrainIdx[queryIdx] = myBestTrainIdx;
-                bestDistance[queryIdx] = myBestDistance;
-            }
-        }
-
-        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
-                   const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
-                   cudaStream_t stream)
-        {
-            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
-            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
-
-            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
-                              int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
-        {
-            extern __shared__ int smem[];
-
-            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
-
-            float myBestDistance = numeric_limits<float>::max();
-            int myBestTrainIdx = -1;
-            int myBestImgIdx = -1;
-
-            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
-            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
-
-            Mask m = mask;
-            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
-            {
-                const PtrStepSz<T> train = trains[imgIdx];
-                m.next();
-                loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
-            }
-
-            __syncthreads();
-
-            float* s_distance = (float*)(smem);
-            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
-            int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
-
-            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);
-
-            if (queryIdx < query.rows && threadIdx.x == 0)
-            {
-                bestTrainIdx[queryIdx] = myBestTrainIdx;
-                bestImgIdx[queryIdx] = myBestImgIdx;
-                bestDistance[queryIdx] = myBestDistance;
-            }
-        }
-
-        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
-                   const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
-                   cudaStream_t stream)
-        {
-            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
-            const dim3 grid(divUp(query.rows, BLOCK_SIZE));
-
-            const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        ///////////////////////////////////////////////////////////////////////////////
-        // Match dispatcher
-
-        template <typename Dist, typename T, typename Mask>
-        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
-                             const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
-                             cudaStream_t stream)
-        {
-            if (query.cols <= 64)
-            {
-                matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
-            }
-            else if (query.cols <= 128)
-            {
-                matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream);
-            }
-            /*else if (query.cols <= 256)
-            {
-                matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);
-            }
-            else if (query.cols <= 512)
-            {
-                matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);
-            }
-            else if (query.cols <= 1024)
-            {
-                matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);
-            }*/
-            else
-            {
-                match<16, Dist>(query, train, mask, trainIdx, distance, stream);
-            }
-        }
-
-        template <typename Dist, typename T, typename Mask>
-        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
-                             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
-                             cudaStream_t stream)
-        {
-            if (query.cols <= 64)
-            {
-                matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-            }
-            else if (query.cols <= 128)
-            {
-                matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-            }
-            /*else if (query.cols <= 256)
-            {
-                matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-            }
-            else if (query.cols <= 512)
-            {
-                matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-            }
-            else if (query.cols <= 1024)
-            {
-                matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-            }*/
-            else
-            {
-                match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
-            }
-        }
-
-        ///////////////////////////////////////////////////////////////////////////////
-        // Match caller
-
-        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
-                                               const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
-                                               cudaStream_t stream)
-        {
-            if (mask.data)
-            {
-                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
-                    trainIdx, distance,
-                    stream);
-            }
-            else
-            {
-                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
-                    trainIdx, distance,
-                    stream);
-            }
-        }
-
-        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-
-        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
-                                               const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
-                                               cudaStream_t stream)
-        {
-            if (mask.data)
-            {
-                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
-                    trainIdx, distance,
-                    stream);
-            }
-            else
-            {
-                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
-                    trainIdx, distance,
-                    stream);
-            }
-        }
-
-        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-
-        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
-                                                    const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
-                                                    cudaStream_t stream)
-        {
-            if (mask.data)
-            {
-                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
-                    trainIdx, distance,
-                    stream);
-            }
-            else
-            {
-                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
-                    trainIdx, distance,
-                    stream);
-            }
-        }
-
-        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
-
-        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
-                                               const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
-                                                cudaStream_t stream)
-        {
-            if (masks.data)
-            {
-                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
-                    trainIdx, imgIdx, distance,
-                    stream);
-            }
-            else
-            {
-                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
-                    trainIdx, imgIdx, distance,
-                    stream);
-            }
-        }
-
-        template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        //template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        template void matchL1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-
-        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
-                                               const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
-                                               cudaStream_t stream)
-        {
-            if (masks.data)
-            {
-                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
-                    trainIdx, imgIdx, distance,
-                    stream);
-            }
-            else
-            {
-                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
-                    trainIdx, imgIdx, distance,
-                    stream);
-            }
-        }
-
-        //template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        //template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        //template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        //template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        //template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-
-        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
-                                                    const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
-                                                    cudaStream_t stream)
-        {
-            if (masks.data)
-            {
-                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
-                    trainIdx, imgIdx, distance,
-                    stream);
-            }
-            else
-            {
-                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
-                    trainIdx, imgIdx, distance,
-                    stream);
-            }
-        }
-
-        template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        //template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        //template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-        template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
-    } // namespace bf_match
-}}} // namespace cv { namespace gpu { namespace cudev {
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/bf_radius_match.cu
+++ b/modules/gpu/src/cuda/bf_radius_match.cu
@@ -1,463 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/utility.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-#include "opencv2/core/cuda/vec_distance.hpp"
-#include "opencv2/core/cuda/datamov_utils.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace bf_radius_match
-    {
-        ///////////////////////////////////////////////////////////////////////////////
-        // Match Unrolled
-
-        template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
-        __global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
-            PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
-        {
-            extern __shared__ int smem[];
-
-            const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
-            const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
-
-            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
-            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
-
-            Dist dist;
-
-            #pragma unroll
-            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
-            {
-                const int loadX = threadIdx.x + i * BLOCK_SIZE;
-
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
-
-                if (loadX < query.cols)
-                {
-                    T val;
-
-                    ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
-                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
-
-                    ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
-                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
-                }
-
-                __syncthreads();
-
-                #pragma unroll
-                for (int j = 0; j < BLOCK_SIZE; ++j)
-                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
-
-                __syncthreads();
-            }
-
-            float distVal = (typename Dist::result_type)dist;
-
-            if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
-            {
-                unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
-                if (ind < maxCount)
-                {
-                    bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
-                    if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
-                    bestDistance.ptr(queryIdx)[ind] = distVal;
-                }
-            }
-        }
-
-        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
-        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
-            const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream)
-        {
-            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
-            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
-
-            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
-                trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T>
-        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
-            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
-            cudaStream_t stream)
-        {
-            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
-
-            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-            for (int i = 0; i < n; ++i)
-            {
-                const PtrStepSz<T> train = trains[i];
-
-                const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
-
-                if (masks != 0 && masks[i].data)
-                {
-                    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
-                        trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
-                }
-                else
-                {
-                    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
-                        trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
-                }
-                cudaSafeCall( cudaGetLastError() );
-            }
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        ///////////////////////////////////////////////////////////////////////////////
-        // Match
-
-        template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
-        __global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
-            PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
-        {
-            extern __shared__ int smem[];
-
-            const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
-            const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
-
-            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
-            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
-
-            Dist dist;
-
-            for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
-            {
-                const int loadX = threadIdx.x + i * BLOCK_SIZE;
-
-                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
-                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
-
-                if (loadX < query.cols)
-                {
-                    T val;
-
-                    ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
-                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
-
-                    ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
-                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
-                }
-
-                __syncthreads();
-
-                #pragma unroll
-                for (int j = 0; j < BLOCK_SIZE; ++j)
-                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
-
-                __syncthreads();
-            }
-
-            float distVal = (typename Dist::result_type)dist;
-
-            if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
-            {
-                unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
-                if (ind < maxCount)
-                {
-                    bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
-                    if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
-                    bestDistance.ptr(queryIdx)[ind] = distVal;
-                }
-            }
-        }
-
-        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
-        void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
-            const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
-            cudaStream_t stream)
-        {
-            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
-            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
-
-            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-            match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
-                trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template <int BLOCK_SIZE, typename Dist, typename T>
-        void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
-            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
-            cudaStream_t stream)
-        {
-            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
-
-            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
-
-            for (int i = 0; i < n; ++i)
-            {
-                const PtrStepSz<T> train = trains[i];
-
-                const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
-
-                if (masks != 0 && masks[i].data)
-                {
-                    match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
-                        trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
-                }
-                else
-                {
-                    match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
-                        trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
-                }
-                cudaSafeCall( cudaGetLastError() );
-            }
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        ///////////////////////////////////////////////////////////////////////////////
-        // Match dispatcher
-
-        template <typename Dist, typename T, typename Mask>
-        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, float maxDistance, const Mask& mask,
-                             const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
-                             cudaStream_t stream)
-        {
-            if (query.cols <= 64)
-            {
-                matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-            }
-            else if (query.cols <= 128)
-            {
-                matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-            }
-            /*else if (query.cols <= 256)
-            {
-                matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-            }
-            else if (query.cols <= 512)
-            {
-                matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-            }
-            else if (query.cols <= 1024)
-            {
-                matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-            }*/
-            else
-            {
-                match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
-            }
-        }
-
-        template <typename Dist, typename T>
-        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, float maxDistance, const PtrStepSzb* masks,
-                             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
-                             cudaStream_t stream)
-        {
-            if (query.cols <= 64)
-            {
-                matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-            }
-            else if (query.cols <= 128)
-            {
-                matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-            }
-            /*else if (query.cols <= 256)
-            {
-                matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-            }
-            else if (query.cols <= 512)
-            {
-                matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-            }
-            else if (query.cols <= 1024)
-            {
-                matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-            }*/
-            else
-            {
-                match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
-            }
-        }
-
-        ///////////////////////////////////////////////////////////////////////////////
-        // Radius Match caller
-
-        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
-            const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
-            cudaStream_t stream)
-        {
-            if (mask.data)
-            {
-                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
-                    trainIdx, distance, nMatches,
-                    stream);
-            }
-            else
-            {
-                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
-                    trainIdx, distance, nMatches,
-                    stream);
-            }
-        }
-
-        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-
-        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
-            const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
-            cudaStream_t stream)
-        {
-            if (mask.data)
-            {
-                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
-                    trainIdx, distance, nMatches,
-                    stream);
-            }
-            else
-            {
-                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
-                    trainIdx, distance, nMatches,
-                    stream);
-            }
-        }
-
-        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-
-        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, float maxDistance, const PtrStepSzb& mask,
-            const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
-            cudaStream_t stream)
-        {
-            if (mask.data)
-            {
-                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, SingleMask(mask),
-                    trainIdx, distance, nMatches,
-                    stream);
-            }
-            else
-            {
-                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), maxDistance, WithOutMask(),
-                    trainIdx, distance, nMatches,
-                    stream);
-            }
-        }
-
-        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, float maxDistance, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-
-        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
-            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
-            cudaStream_t stream)
-        {
-            matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
-                trainIdx, imgIdx, distance, nMatches,
-                stream);
-        }
-
-        template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        //template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        template void matchL1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-
-        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
-            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
-            cudaStream_t stream)
-        {
-            matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
-                trainIdx, imgIdx, distance, nMatches,
-                stream);
-        }
-
-        //template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        //template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        //template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        //template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        //template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-
-        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks,
-            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches,
-            cudaStream_t stream)
-        {
-            matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains, n, maxDistance, masks,
-                trainIdx, imgIdx, distance, nMatches,
-                stream);
-        }
-
-        template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        //template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        //template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-        template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb* trains, int n, float maxDistance, const PtrStepSzb* masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, const PtrStepSz<unsigned int>& nMatches, cudaStream_t stream);
-    } // namespace bf_radius_match
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/bgfg_gmg.cu
+++ b/modules/gpu/src/cuda/bgfg_gmg.cu
@@ -1,258 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-
-namespace cv { namespace gpu { namespace cudev {
-    namespace bgfg_gmg
-    {
-        __constant__ int   c_width;
-        __constant__ int   c_height;
-        __constant__ float c_minVal;
-        __constant__ float c_maxVal;
-        __constant__ int   c_quantizationLevels;
-        __constant__ float c_backgroundPrior;
-        __constant__ float c_decisionThreshold;
-        __constant__ int   c_maxFeatures;
-        __constant__ int   c_numInitializationFrames;
-
-        void loadConstants(int width, int height, float minVal, float maxVal, int quantizationLevels, float backgroundPrior,
-                           float decisionThreshold, int maxFeatures, int numInitializationFrames)
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(c_width, &width, sizeof(width)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_height, &height, sizeof(height)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_minVal, &minVal, sizeof(minVal)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_maxVal, &maxVal, sizeof(maxVal)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_quantizationLevels, &quantizationLevels, sizeof(quantizationLevels)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_backgroundPrior, &backgroundPrior, sizeof(backgroundPrior)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_decisionThreshold, &decisionThreshold, sizeof(decisionThreshold)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_maxFeatures, &maxFeatures, sizeof(maxFeatures)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_numInitializationFrames, &numInitializationFrames, sizeof(numInitializationFrames)) );
-        }
-
-        __device__ float findFeature(const int color, const PtrStepi& colors, const PtrStepf& weights, const int x, const int y, const int nfeatures)
-        {
-            for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
-            {
-                if (color == colors(fy, x))
-                    return weights(fy, x);
-            }
-
-            // not in histogram, so return 0.
-            return 0.0f;
-        }
-
-        __device__ void normalizeHistogram(PtrStepf weights, const int x, const int y, const int nfeatures)
-        {
-            float total = 0.0f;
-            for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
-                total += weights(fy, x);
-
-            if (total != 0.0f)
-            {
-                for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
-                    weights(fy, x) /= total;
-            }
-        }
-
-        __device__ bool insertFeature(const int color, const float weight, PtrStepi colors, PtrStepf weights, const int x, const int y, int& nfeatures)
-        {
-            for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
-            {
-                if (color == colors(fy, x))
-                {
-                    // feature in histogram
-
-                    weights(fy, x) += weight;
-
-                    return false;
-                }
-            }
-
-            if (nfeatures == c_maxFeatures)
-            {
-                // discard oldest feature
-
-                int idx = -1;
-                float minVal = numeric_limits<float>::max();
-                for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
-                {
-                    const float w = weights(fy, x);
-                    if (w < minVal)
-                    {
-                        minVal = w;
-                        idx = fy;
-                    }
-                }
-
-                colors(idx, x) = color;
-                weights(idx, x) = weight;
-
-                return false;
-            }
-
-            colors(nfeatures * c_height + y, x) = color;
-            weights(nfeatures * c_height + y, x) = weight;
-
-            ++nfeatures;
-
-            return true;
-        }
-
-        namespace detail
-        {
-            template <int cn> struct Quantization
-            {
-                template <typename T>
-                __device__ static int apply(const T& val)
-                {
-                    int res = 0;
-                    res |= static_cast<int>((val.x - c_minVal) * c_quantizationLevels / (c_maxVal - c_minVal));
-                    res |= static_cast<int>((val.y - c_minVal) * c_quantizationLevels / (c_maxVal - c_minVal)) << 8;
-                    res |= static_cast<int>((val.z - c_minVal) * c_quantizationLevels / (c_maxVal - c_minVal)) << 16;
-                    return res;
-                }
-            };
-
-            template <> struct Quantization<1>
-            {
-                template <typename T>
-                __device__ static int apply(T val)
-                {
-                    return static_cast<int>((val - c_minVal) * c_quantizationLevels / (c_maxVal - c_minVal));
-                }
-            };
-        }
-
-        template <typename T> struct Quantization : detail::Quantization<VecTraits<T>::cn> {};
-
-        template <typename SrcT>
-        __global__ void update(const PtrStep<SrcT> frame, PtrStepb fgmask, PtrStepi colors_, PtrStepf weights_, PtrStepi nfeatures_,
-                               const int frameNum, const float learningRate, const bool updateBackgroundModel)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= c_width || y >= c_height)
-                return;
-
-            const SrcT pix = frame(y, x);
-            const int newFeatureColor = Quantization<SrcT>::apply(pix);
-
-            int nfeatures = nfeatures_(y, x);
-
-            if (frameNum >= c_numInitializationFrames)
-            {
-                // typical operation
-
-                const float weight = findFeature(newFeatureColor, colors_, weights_, x, y, nfeatures);
-
-                // see Godbehere, Matsukawa, Goldberg (2012) for reasoning behind this implementation of Bayes rule
-                const float posterior = (weight * c_backgroundPrior) / (weight * c_backgroundPrior + (1.0f - weight) * (1.0f - c_backgroundPrior));
-
-                const bool isForeground = ((1.0f - posterior) > c_decisionThreshold);
-                fgmask(y, x) = (uchar)(-isForeground);
-
-                // update histogram.
-
-                if (updateBackgroundModel)
-                {
-                    for (int i = 0, fy = y; i < nfeatures; ++i, fy += c_height)
-                        weights_(fy, x) *= 1.0f - learningRate;
-
-                    bool inserted = insertFeature(newFeatureColor, learningRate, colors_, weights_, x, y, nfeatures);
-
-                    if (inserted)
-                    {
-                        normalizeHistogram(weights_, x, y, nfeatures);
-                        nfeatures_(y, x) = nfeatures;
-                    }
-                }
-            }
-            else if (updateBackgroundModel)
-            {
-                // training-mode update
-
-                insertFeature(newFeatureColor, 1.0f, colors_, weights_, x, y, nfeatures);
-
-                if (frameNum == c_numInitializationFrames - 1)
-                    normalizeHistogram(weights_, x, y, nfeatures);
-            }
-        }
-
-        template <typename SrcT>
-        void update_gpu(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
-                        int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream)
-        {
-            const dim3 block(32, 8);
-            const dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
-
-            cudaSafeCall( cudaFuncSetCacheConfig(update<SrcT>, cudaFuncCachePreferL1) );
-
-            update<SrcT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, colors, weights, nfeatures, frameNum, learningRate, updateBackgroundModel);
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void update_gpu<uchar  >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-        template void update_gpu<uchar3 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-        template void update_gpu<uchar4 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-
-        template void update_gpu<ushort >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-        template void update_gpu<ushort3>(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-        template void update_gpu<ushort4>(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-
-        template void update_gpu<float  >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-        template void update_gpu<float3 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-        template void update_gpu<float4 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
-    }
-}}}
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/bgfg_mog.cu
+++ b/modules/gpu/src/cuda/bgfg_mog.cu
@@ -1,764 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace mog
-    {
-        ///////////////////////////////////////////////////////////////
-        // Utility
-
-        __device__ __forceinline__ float cvt(uchar val)
-        {
-            return val;
-        }
-        __device__ __forceinline__ float3 cvt(const uchar3& val)
-        {
-            return make_float3(val.x, val.y, val.z);
-        }
-        __device__ __forceinline__ float4 cvt(const uchar4& val)
-        {
-            return make_float4(val.x, val.y, val.z, val.w);
-        }
-
-        __device__ __forceinline__ float sqr(float val)
-        {
-            return val * val;
-        }
-        __device__ __forceinline__ float sqr(const float3& val)
-        {
-            return val.x * val.x + val.y * val.y + val.z * val.z;
-        }
-        __device__ __forceinline__ float sqr(const float4& val)
-        {
-            return val.x * val.x + val.y * val.y + val.z * val.z;
-        }
-
-        __device__ __forceinline__ float sum(float val)
-        {
-            return val;
-        }
-        __device__ __forceinline__ float sum(const float3& val)
-        {
-            return val.x + val.y + val.z;
-        }
-        __device__ __forceinline__ float sum(const float4& val)
-        {
-            return val.x + val.y + val.z;
-        }
-
-        __device__ __forceinline__ float clamp(float var, float learningRate, float diff, float minVar)
-        {
-             return ::fmaxf(var + learningRate * (diff * diff - var), minVar);
-        }
-        __device__ __forceinline__ float3 clamp(const float3& var, float learningRate, const float3& diff, float minVar)
-        {
-             return make_float3(::fmaxf(var.x + learningRate * (diff.x * diff.x - var.x), minVar),
-                                ::fmaxf(var.y + learningRate * (diff.y * diff.y - var.y), minVar),
-                                ::fmaxf(var.z + learningRate * (diff.z * diff.z - var.z), minVar));
-        }
-        __device__ __forceinline__ float4 clamp(const float4& var, float learningRate, const float4& diff, float minVar)
-        {
-             return make_float4(::fmaxf(var.x + learningRate * (diff.x * diff.x - var.x), minVar),
-                                ::fmaxf(var.y + learningRate * (diff.y * diff.y - var.y), minVar),
-                                ::fmaxf(var.z + learningRate * (diff.z * diff.z - var.z), minVar),
-                                0.0f);
-        }
-
-        template <class Ptr2D>
-        __device__ __forceinline__ void swap(Ptr2D& ptr, int x, int y, int k, int rows)
-        {
-            typename Ptr2D::elem_type val = ptr(k * rows + y, x);
-            ptr(k * rows + y, x) = ptr((k + 1) * rows + y, x);
-            ptr((k + 1) * rows + y, x) = val;
-        }
-
-        ///////////////////////////////////////////////////////////////
-        // MOG without learning
-
-        template <typename SrcT, typename WorkT>
-        __global__ void mog_withoutLearning(const PtrStepSz<SrcT> frame, PtrStepb fgmask,
-                                            const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, const PtrStep<WorkT> gmm_var,
-                                            const int nmixtures, const float varThreshold, const float backgroundRatio)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= frame.cols || y >= frame.rows)
-                return;
-
-            WorkT pix = cvt(frame(y, x));
-
-            int kHit = -1;
-            int kForeground = -1;
-
-            for (int k = 0; k < nmixtures; ++k)
-            {
-                if (gmm_weight(k * frame.rows + y, x) < numeric_limits<float>::epsilon())
-                    break;
-
-                WorkT mu = gmm_mean(k * frame.rows + y, x);
-                WorkT var = gmm_var(k * frame.rows + y, x);
-
-                WorkT diff = pix - mu;
-
-                if (sqr(diff) < varThreshold * sum(var))
-                {
-                    kHit = k;
-                    break;
-                }
-            }
-
-            if (kHit >= 0)
-            {
-                float wsum = 0.0f;
-                for (int k = 0; k < nmixtures; ++k)
-                {
-                    wsum += gmm_weight(k * frame.rows + y, x);
-
-                    if (wsum > backgroundRatio)
-                    {
-                        kForeground = k + 1;
-                        break;
-                    }
-                }
-            }
-
-            fgmask(y, x) = (uchar) (-(kHit < 0 || kHit >= kForeground));
-        }
-
-        template <typename SrcT, typename WorkT>
-        void mog_withoutLearning_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb var,
-                                        int nmixtures, float varThreshold, float backgroundRatio, cudaStream_t stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
-
-            cudaSafeCall( cudaFuncSetCacheConfig(mog_withoutLearning<SrcT, WorkT>, cudaFuncCachePreferL1) );
-
-            mog_withoutLearning<SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask,
-                                                                         weight, (PtrStepSz<WorkT>) mean, (PtrStepSz<WorkT>) var,
-                                                                         nmixtures, varThreshold, backgroundRatio);
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        ///////////////////////////////////////////////////////////////
-        // MOG with learning
-
-        template <typename SrcT, typename WorkT>
-        __global__ void mog_withLearning(const PtrStepSz<SrcT> frame, PtrStepb fgmask,
-                                         PtrStepf gmm_weight, PtrStepf gmm_sortKey, PtrStep<WorkT> gmm_mean, PtrStep<WorkT> gmm_var,
-                                         const int nmixtures, const float varThreshold, const float backgroundRatio, const float learningRate, const float minVar)
-        {
-            const float w0 = 0.05f;
-            const float sk0 = w0 / (30.0f * 0.5f * 2.0f);
-            const float var0 = 30.0f * 0.5f * 30.0f * 0.5f * 4.0f;
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= frame.cols || y >= frame.rows)
-                return;
-
-            WorkT pix = cvt(frame(y, x));
-
-            float wsum = 0.0f;
-            int kHit = -1;
-            int kForeground = -1;
-
-            int k = 0;
-            for (; k < nmixtures; ++k)
-            {
-                float w = gmm_weight(k * frame.rows + y, x);
-                wsum += w;
-
-                if (w < numeric_limits<float>::epsilon())
-                    break;
-
-                WorkT mu = gmm_mean(k * frame.rows + y, x);
-                WorkT var = gmm_var(k * frame.rows + y, x);
-
-                WorkT diff = pix - mu;
-
-                if (sqr(diff) < varThreshold * sum(var))
-                {
-                    wsum -= w;
-                    float dw = learningRate * (1.0f - w);
-
-                    var = clamp(var, learningRate, diff, minVar);
-
-                    float sortKey_prev = w / ::sqrtf(sum(var));
-                    gmm_sortKey(k * frame.rows + y, x) = sortKey_prev;
-
-                    float weight_prev = w + dw;
-                    gmm_weight(k * frame.rows + y, x) = weight_prev;
-
-                    WorkT mean_prev = mu + learningRate * diff;
-                    gmm_mean(k * frame.rows + y, x) = mean_prev;
-
-                    WorkT var_prev = var;
-                    gmm_var(k * frame.rows + y, x) = var_prev;
-
-                    int k1 = k - 1;
-
-                    if (k1 >= 0)
-                    {
-                        float sortKey_next = gmm_sortKey(k1 * frame.rows + y, x);
-                        float weight_next = gmm_weight(k1 * frame.rows + y, x);
-                        WorkT mean_next = gmm_mean(k1 * frame.rows + y, x);
-                        WorkT var_next = gmm_var(k1 * frame.rows + y, x);
-
-                        for (; sortKey_next < sortKey_prev && k1 >= 0; --k1)
-                        {
-                            gmm_sortKey(k1 * frame.rows + y, x) = sortKey_prev;
-                            gmm_sortKey((k1 + 1) * frame.rows + y, x) = sortKey_next;
-
-                            gmm_weight(k1 * frame.rows + y, x) = weight_prev;
-                            gmm_weight((k1 + 1) * frame.rows + y, x) = weight_next;
-
-                            gmm_mean(k1 * frame.rows + y, x) = mean_prev;
-                            gmm_mean((k1 + 1) * frame.rows + y, x) = mean_next;
-
-                            gmm_var(k1 * frame.rows + y, x) = var_prev;
-                            gmm_var((k1 + 1) * frame.rows + y, x) = var_next;
-
-                            sortKey_prev = sortKey_next;
-                            sortKey_next = k1 > 0 ? gmm_sortKey((k1 - 1) * frame.rows + y, x) : 0.0f;
-
-                            weight_prev = weight_next;
-                            weight_next = k1 > 0 ? gmm_weight((k1 - 1) * frame.rows + y, x) : 0.0f;
-
-                            mean_prev = mean_next;
-                            mean_next = k1 > 0 ? gmm_mean((k1 - 1) * frame.rows + y, x) : VecTraits<WorkT>::all(0.0f);
-
-                            var_prev = var_next;
-                            var_next = k1 > 0 ? gmm_var((k1 - 1) * frame.rows + y, x) : VecTraits<WorkT>::all(0.0f);
-                        }
-                    }
-
-                    kHit = k1 + 1;
-                    break;
-                }
-            }
-
-            if (kHit < 0)
-            {
-                // no appropriate gaussian mixture found at all, remove the weakest mixture and create a new one
-                kHit = k = ::min(k, nmixtures - 1);
-                wsum += w0 - gmm_weight(k * frame.rows + y, x);
-
-                gmm_weight(k * frame.rows + y, x) = w0;
-                gmm_mean(k * frame.rows + y, x) = pix;
-                gmm_var(k * frame.rows + y, x) = VecTraits<WorkT>::all(var0);
-                gmm_sortKey(k * frame.rows + y, x) = sk0;
-            }
-            else
-            {
-                for( ; k < nmixtures; k++)
-                    wsum += gmm_weight(k * frame.rows + y, x);
-            }
-
-            float wscale = 1.0f / wsum;
-            wsum = 0;
-            for (k = 0; k < nmixtures; ++k)
-            {
-                float w = gmm_weight(k * frame.rows + y, x);
-                wsum += w *= wscale;
-
-                gmm_weight(k * frame.rows + y, x) = w;
-                gmm_sortKey(k * frame.rows + y, x) *= wscale;
-
-                if (wsum > backgroundRatio && kForeground < 0)
-                    kForeground = k + 1;
-            }
-
-            fgmask(y, x) = (uchar)(-(kHit >= kForeground));
-        }
-
-        template <typename SrcT, typename WorkT>
-        void mog_withLearning_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var,
-                                     int nmixtures, float varThreshold, float backgroundRatio, float learningRate, float minVar,
-                                     cudaStream_t stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
-
-            cudaSafeCall( cudaFuncSetCacheConfig(mog_withLearning<SrcT, WorkT>, cudaFuncCachePreferL1) );
-
-            mog_withLearning<SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask,
-                                                                      weight, sortKey, (PtrStepSz<WorkT>) mean, (PtrStepSz<WorkT>) var,
-                                                                      nmixtures, varThreshold, backgroundRatio, learningRate, minVar);
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        ///////////////////////////////////////////////////////////////
-        // MOG
-
-        void mog_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var, int nmixtures, float varThreshold, float learningRate, float backgroundRatio, float noiseSigma, cudaStream_t stream)
-        {
-            typedef void (*withoutLearning_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb var, int nmixtures, float varThreshold, float backgroundRatio, cudaStream_t stream);
-            typedef void (*withLearning_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var, int nmixtures, float varThreshold, float backgroundRatio, float learningRate, float minVar, cudaStream_t stream);
-
-            static const withoutLearning_t withoutLearning[] =
-            {
-                0, mog_withoutLearning_caller<uchar, float>, 0, mog_withoutLearning_caller<uchar3, float3>, mog_withoutLearning_caller<uchar4, float4>
-            };
-            static const withLearning_t withLearning[] =
-            {
-                0, mog_withLearning_caller<uchar, float>, 0, mog_withLearning_caller<uchar3, float3>, mog_withLearning_caller<uchar4, float4>
-            };
-
-            const float minVar = noiseSigma * noiseSigma;
-
-            if (learningRate > 0.0f)
-                withLearning[cn](frame, fgmask, weight, sortKey, mean, var, nmixtures, varThreshold, backgroundRatio, learningRate, minVar, stream);
-            else
-                withoutLearning[cn](frame, fgmask, weight, mean, var, nmixtures, varThreshold, backgroundRatio, stream);
-        }
-
-        template <typename WorkT, typename OutT>
-        __global__ void getBackgroundImage(const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, PtrStepSz<OutT> dst, const int nmixtures, const float backgroundRatio)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= dst.cols || y >= dst.rows)
-                return;
-
-            WorkT meanVal = VecTraits<WorkT>::all(0.0f);
-            float totalWeight = 0.0f;
-
-            for (int mode = 0; mode < nmixtures; ++mode)
-            {
-                float weight = gmm_weight(mode * dst.rows + y, x);
-
-                WorkT mean = gmm_mean(mode * dst.rows + y, x);
-                meanVal = meanVal + weight * mean;
-
-                totalWeight += weight;
-
-                if(totalWeight > backgroundRatio)
-                    break;
-            }
-
-            meanVal = meanVal * (1.f / totalWeight);
-
-            dst(y, x) = saturate_cast<OutT>(meanVal);
-        }
-
-        template <typename WorkT, typename OutT>
-        void getBackgroundImage_caller(PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-            cudaSafeCall( cudaFuncSetCacheConfig(getBackgroundImage<WorkT, OutT>, cudaFuncCachePreferL1) );
-
-            getBackgroundImage<WorkT, OutT><<<grid, block, 0, stream>>>(weight, (PtrStepSz<WorkT>) mean, (PtrStepSz<OutT>) dst, nmixtures, backgroundRatio);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void getBackgroundImage_gpu(int cn, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream)
-        {
-            typedef void (*func_t)(PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream);
-
-            static const func_t funcs[] =
-            {
-                0, getBackgroundImage_caller<float, uchar>, 0, getBackgroundImage_caller<float3, uchar3>, getBackgroundImage_caller<float4, uchar4>
-            };
-
-            funcs[cn](weight, mean, dst, nmixtures, backgroundRatio, stream);
-        }
-
-        ///////////////////////////////////////////////////////////////
-        // MOG2
-
-        __constant__ int           c_nmixtures;
-        __constant__ float         c_Tb;
-        __constant__ float         c_TB;
-        __constant__ float         c_Tg;
-        __constant__ float         c_varInit;
-        __constant__ float         c_varMin;
-        __constant__ float         c_varMax;
-        __constant__ float         c_tau;
-        __constant__ unsigned char c_shadowVal;
-
-        void loadConstants(int nmixtures, float Tb, float TB, float Tg, float varInit, float varMin, float varMax, float tau, unsigned char shadowVal)
-        {
-            varMin = ::fminf(varMin, varMax);
-            varMax = ::fmaxf(varMin, varMax);
-
-            cudaSafeCall( cudaMemcpyToSymbol(c_nmixtures, &nmixtures, sizeof(int)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_Tb, &Tb, sizeof(float)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_TB, &TB, sizeof(float)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_Tg, &Tg, sizeof(float)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_varInit, &varInit, sizeof(float)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_varMin, &varMin, sizeof(float)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_varMax, &varMax, sizeof(float)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_tau, &tau, sizeof(float)) );
-            cudaSafeCall( cudaMemcpyToSymbol(c_shadowVal, &shadowVal, sizeof(unsigned char)) );
-        }
-
-        template <bool detectShadows, typename SrcT, typename WorkT>
-        __global__ void mog2(const PtrStepSz<SrcT> frame, PtrStepb fgmask, PtrStepb modesUsed,
-                             PtrStepf gmm_weight, PtrStepf gmm_variance, PtrStep<WorkT> gmm_mean,
-                             const float alphaT, const float alpha1, const float prune)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= frame.cols || y >= frame.rows)
-                return;
-
-            WorkT pix = cvt(frame(y, x));
-
-            //calculate distances to the modes (+ sort)
-            //here we need to go in descending order!!!
-
-            bool background = false; // true - the pixel classified as background
-
-            //internal:
-
-            bool fitsPDF = false; //if it remains zero a new GMM mode will be added
-
-            int nmodes = modesUsed(y, x);
-            int nNewModes = nmodes; //current number of modes in GMM
-
-            float totalWeight = 0.0f;
-
-            //go through all modes
-
-            for (int mode = 0; mode < nmodes; ++mode)
-            {
-                //need only weight if fit is found
-                float weight = alpha1 * gmm_weight(mode * frame.rows + y, x) + prune;
-
-                //fit not found yet
-                if (!fitsPDF)
-                {
-                    //check if it belongs to some of the remaining modes
-                    float var = gmm_variance(mode * frame.rows + y, x);
-
-                    WorkT mean = gmm_mean(mode * frame.rows + y, x);
-
-                    //calculate difference and distance
-                    WorkT diff = mean - pix;
-                    float dist2 = sqr(diff);
-
-                    //background? - Tb - usually larger than Tg
-                    if (totalWeight < c_TB && dist2 < c_Tb * var)
-                        background = true;
-
-                    //check fit
-                    if (dist2 < c_Tg * var)
-                    {
-                        //belongs to the mode
-                        fitsPDF = true;
-
-                        //update distribution
-
-                        //update weight
-                        weight += alphaT;
-                        float k = alphaT / weight;
-
-                        //update mean
-                        gmm_mean(mode * frame.rows + y, x) = mean - k * diff;
-
-                        //update variance
-                        float varnew = var + k * (dist2 - var);
-
-                        //limit the variance
-                        varnew = ::fmaxf(varnew, c_varMin);
-                        varnew = ::fminf(varnew, c_varMax);
-
-                        gmm_variance(mode * frame.rows + y, x) = varnew;
-
-                        //sort
-                        //all other weights are at the same place and
-                        //only the matched (iModes) is higher -> just find the new place for it
-
-                        for (int i = mode; i > 0; --i)
-                        {
-                            //check one up
-                            if (weight < gmm_weight((i - 1) * frame.rows + y, x))
-                                break;
-
-                            //swap one up
-                            swap(gmm_weight, x, y, i - 1, frame.rows);
-                            swap(gmm_variance, x, y, i - 1, frame.rows);
-                            swap(gmm_mean, x, y, i - 1, frame.rows);
-                        }
-
-                        //belongs to the mode - bFitsPDF becomes 1
-                    }
-                } // !fitsPDF
-
-                //check prune
-                if (weight < -prune)
-                {
-                    weight = 0.0;
-                    nmodes--;
-                }
-
-                gmm_weight(mode * frame.rows + y, x) = weight; //update weight by the calculated value
-                totalWeight += weight;
-            }
-
-            //renormalize weights
-
-            totalWeight = 1.f / totalWeight;
-            for (int mode = 0; mode < nmodes; ++mode)
-                gmm_weight(mode * frame.rows + y, x) *= totalWeight;
-
-            nmodes = nNewModes;
-
-            //make new mode if needed and exit
-
-            if (!fitsPDF)
-            {
-                // replace the weakest or add a new one
-                int mode = nmodes == c_nmixtures ? c_nmixtures - 1 : nmodes++;
-
-                if (nmodes == 1)
-                    gmm_weight(mode * frame.rows + y, x) = 1.f;
-                else
-                {
-                    gmm_weight(mode * frame.rows + y, x) = alphaT;
-
-                    // renormalize all other weights
-
-                    for (int i = 0; i < nmodes - 1; ++i)
-                        gmm_weight(i * frame.rows + y, x) *= alpha1;
-                }
-
-                // init
-
-                gmm_mean(mode * frame.rows + y, x) = pix;
-                gmm_variance(mode * frame.rows + y, x) = c_varInit;
-
-                //sort
-                //find the new place for it
-
-                for (int i = nmodes - 1; i > 0; --i)
-                {
-                    // check one up
-                    if (alphaT < gmm_weight((i - 1) * frame.rows + y, x))
-                        break;
-
-                    //swap one up
-                    swap(gmm_weight, x, y, i - 1, frame.rows);
-                    swap(gmm_variance, x, y, i - 1, frame.rows);
-                    swap(gmm_mean, x, y, i - 1, frame.rows);
-                }
-            }
-
-            //set the number of modes
-            modesUsed(y, x) = nmodes;
-
-            bool isShadow = false;
-            if (detectShadows && !background)
-            {
-                float tWeight = 0.0f;
-
-                // check all the components  marked as background:
-                for (int mode = 0; mode < nmodes; ++mode)
-                {
-                    WorkT mean = gmm_mean(mode * frame.rows + y, x);
-
-                    WorkT pix_mean = pix * mean;
-
-                    float numerator = sum(pix_mean);
-                    float denominator = sqr(mean);
-
-                    // no division by zero allowed
-                    if (denominator == 0)
-                        break;
-
-                    // if tau < a < 1 then also check the color distortion
-                    if (numerator <= denominator && numerator >= c_tau * denominator)
-                    {
-                        float a = numerator / denominator;
-
-                        WorkT dD = a * mean - pix;
-
-                        if (sqr(dD) < c_Tb * gmm_variance(mode * frame.rows + y, x) * a * a)
-                        {
-                            isShadow = true;
-                            break;
-                        }
-                    };
-
-                    tWeight += gmm_weight(mode * frame.rows + y, x);
-                    if (tWeight > c_TB)
-                        break;
-                }
-            }
-
-            fgmask(y, x) = background ? 0 : isShadow ? c_shadowVal : 255;
-        }
-
-        template <typename SrcT, typename WorkT>
-        void mog2_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean,
-                         float alphaT, float prune, bool detectShadows, cudaStream_t stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
-
-            const float alpha1 = 1.0f - alphaT;
-
-            if (detectShadows)
-            {
-                cudaSafeCall( cudaFuncSetCacheConfig(mog2<true, SrcT, WorkT>, cudaFuncCachePreferL1) );
-
-                mog2<true, SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, modesUsed,
-                                                                    weight, variance, (PtrStepSz<WorkT>) mean,
-                                                                    alphaT, alpha1, prune);
-            }
-            else
-            {
-                cudaSafeCall( cudaFuncSetCacheConfig(mog2<false, SrcT, WorkT>, cudaFuncCachePreferL1) );
-
-                mog2<false, SrcT, WorkT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, modesUsed,
-                                                                    weight, variance, (PtrStepSz<WorkT>) mean,
-                                                                    alphaT, alpha1, prune);
-            }
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void mog2_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean,
-                      float alphaT, float prune, bool detectShadows, cudaStream_t stream)
-        {
-            typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean, float alphaT, float prune, bool detectShadows, cudaStream_t stream);
-
-            static const func_t funcs[] =
-            {
-                0, mog2_caller<uchar, float>, 0, mog2_caller<uchar3, float3>, mog2_caller<uchar4, float4>
-            };
-
-            funcs[cn](frame, fgmask, modesUsed, weight, variance, mean, alphaT, prune, detectShadows, stream);
-        }
-
-        template <typename WorkT, typename OutT>
-        __global__ void getBackgroundImage2(const PtrStepSzb modesUsed, const PtrStepf gmm_weight, const PtrStep<WorkT> gmm_mean, PtrStep<OutT> dst)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= modesUsed.cols || y >= modesUsed.rows)
-                return;
-
-            int nmodes = modesUsed(y, x);
-
-            WorkT meanVal = VecTraits<WorkT>::all(0.0f);
-            float totalWeight = 0.0f;
-
-            for (int mode = 0; mode < nmodes; ++mode)
-            {
-                float weight = gmm_weight(mode * modesUsed.rows + y, x);
-
-                WorkT mean = gmm_mean(mode * modesUsed.rows + y, x);
-                meanVal = meanVal + weight * mean;
-
-                totalWeight += weight;
-
-                if(totalWeight > c_TB)
-                    break;
-            }
-
-            meanVal = meanVal * (1.f / totalWeight);
-
-            dst(y, x) = saturate_cast<OutT>(meanVal);
-        }
-
-        template <typename WorkT, typename OutT>
-        void getBackgroundImage2_caller(PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(modesUsed.cols, block.x), divUp(modesUsed.rows, block.y));
-
-            cudaSafeCall( cudaFuncSetCacheConfig(getBackgroundImage2<WorkT, OutT>, cudaFuncCachePreferL1) );
-
-            getBackgroundImage2<WorkT, OutT><<<grid, block, 0, stream>>>(modesUsed, weight, (PtrStepSz<WorkT>) mean, (PtrStepSz<OutT>) dst);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void getBackgroundImage2_gpu(int cn, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream)
-        {
-            typedef void (*func_t)(PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream);
-
-            static const func_t funcs[] =
-            {
-                0, getBackgroundImage2_caller<float, uchar>, 0, getBackgroundImage2_caller<float3, uchar3>, getBackgroundImage2_caller<float4, uchar4>
-            };
-
-            funcs[cn](modesUsed, weight, mean, dst, stream);
-        }
-    }
-}}}
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/bilateral_filter.cu
+++ b/modules/gpu/src/cuda/bilateral_filter.cu
@@ -1,199 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-
-using namespace cv::gpu;
-
-typedef unsigned char uchar;
-typedef unsigned short ushort;
-
-//////////////////////////////////////////////////////////////////////////////////
-/// Bilateral filtering
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        // L1 norm (sum of absolute component values) of a scalar or of a
-        // 2/3/4-component float vector.
-        __device__ __forceinline__ float norm_l1(const float& a)
-        {
-            return ::fabs(a);
-        }
-        __device__ __forceinline__ float norm_l1(const float2& a)
-        {
-            return ::fabs(a.x) + ::fabs(a.y);
-        }
-        __device__ __forceinline__ float norm_l1(const float3& a)
-        {
-            return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z);
-        }
-        __device__ __forceinline__ float norm_l1(const float4& a)
-        {
-            return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z) + ::fabs(a.w);
-        }
-
-        // Square of a scalar.
-        __device__ __forceinline__ float sqr(const float& a)
-        {
-            return a * a;
-        }
-
-        // Brute-force bilateral filter; each thread computes one output pixel.
-        //   src  - input image (its cols/rows bound the thread grid)
-        //   dst  - output image, written at the same (y, x)
-        //   b    - border policy; b.at() is used whenever the window may leave the image
-        //   ksz  - window size in pixels
-        //   sigma_spatial2_inv_half, sigma_color2_inv_half - factors applied to the squared
-        //          spatial distance and to the squared L1 colour distance inside exp();
-        //          the host launcher in this file passes -0.5f / sigma^2 for both.
-        template<typename T, typename B>
-        __global__ void bilateral_kernel(const PtrStepSz<T> src, PtrStep<T> dst, const B b, const int ksz, const float sigma_spatial2_inv_half, const float sigma_color2_inv_half)
-        {
-            // Float vector with the same channel count as T; all accumulation is done in float.
-            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
-
-            int x = threadIdx.x + blockIdx.x * blockDim.x;
-            int y = threadIdx.y + blockIdx.y * blockDim.y;
-
-            // Threads of partial edge blocks that fall outside the image do nothing.
-            if (x >= src.cols || y >= src.rows)
-                return;
-
-            value_type center = saturate_cast<value_type>(src(y, x));
-
-            // sum1: weighted sum of neighbour values; sum2: sum of the weights.
-            value_type sum1 = VecTraits<value_type>::all(0);
-            float sum2 = 0;
-
-            int r = ksz / 2;
-            float r2 = (float)(r * r);
-
-            // Exclusive end of the window: it spans [x - r, tx) x [y - r, ty), i.e. ksz samples per axis.
-            int tx = x - r + ksz;
-            int ty = y - r + ksz;
-
-            // Fast path: the window lies inside the image, so src can be read directly.
-            // (tx/ty are exclusive ends, so the strict '<' is slightly conservative.)
-            if (x - ksz/2 >=0 && y - ksz/2 >=0 && tx < src.cols && ty < src.rows)
-            {
-                for (int cy = y - r; cy < ty; ++cy)
-                    for (int cx = x - r; cx < tx; ++cx)
-                    {
-                        // Only samples inside the circle of radius r contribute.
-                        float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
-                        if (space2 > r2)
-                            continue;
-
-                        value_type value = saturate_cast<value_type>(src(cy, cx));
-
-                        float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
-                        sum1 = sum1 + weight * value;
-                        sum2 = sum2 + weight;
-                    }
-            }
-            else
-            {
-                // Slow path: same accumulation, but every read goes through the border policy.
-                for (int cy = y - r; cy < ty; ++cy)
-                    for (int cx = x - r; cx < tx; ++cx)
-                    {
-                        float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
-                        if (space2 > r2)
-                            continue;
-
-                        value_type value = saturate_cast<value_type>(b.at(cy, cx, src.data, src.step));
-
-                        float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
-
-                        sum1 = sum1 + weight * value;
-                        sum2 = sum2 + weight;
-                    }
-            }
-            // Normalise by the total weight. The centre sample always contributes exp(0) = 1,
-            // so sum2 is never zero.
-            dst(y, x) = saturate_cast<T>(sum1 / sum2);
-        }
-
-        // Host-side launcher for bilateral_kernel using border policy B<T>.
-        //   src, dst       - byte-typed views, reinterpreted as images of T
-        //   kernel_size    - filter window size in pixels
-        //   sigma_spatial  - spatial sigma
-        //   sigma_color    - colour sigma
-        //   stream         - CUDA stream the kernel is enqueued on; when it is the default
-        //                    stream (0) the call blocks until the device has finished
-        template<typename T, template <typename> class B>
-        void bilateral_caller(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream)
-        {
-            dim3 block (32, 8);
-            dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y));
-
-            B<T> b(src.rows, src.cols);
-
-            // Precompute -1 / (2 * sigma^2); the kernel multiplies squared distances by these.
-            float sigma_spatial2_inv_half = -0.5f/(sigma_spatial * sigma_spatial);
-            float sigma_color2_inv_half = -0.5f/(sigma_color * sigma_color);
-
-            cudaSafeCall( cudaFuncSetCacheConfig (bilateral_kernel<T, B<T> >, cudaFuncCachePreferL1) );
-            // Enqueue on the caller's stream. Launching without the stream argument would run
-            // the kernel on the default stream, and for a non-zero stream nothing below would
-            // synchronise with it.
-            bilateral_kernel<<<grid, block, 0, stream>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, kernel_size, sigma_spatial2_inv_half, sigma_color2_inv_half);
-            cudaSafeCall ( cudaGetLastError () );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        // Entry point of the bilateral filter for pixel type T: selects the launcher
-        // instantiated for the requested border policy and forwards all arguments.
-        // borderMode is used unchecked as an index into the five-entry table below.
-        template<typename T>
-        void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float gauss_spatial_coeff, float gauss_color_coeff, int borderMode, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream);
-
-            // NOTE(review): the entry order must match the numeric border-mode values
-            // used by the callers - confirm against the host code.
-            static caller_t callers[] =
-            {
-                bilateral_caller<T, BrdReflect101>,   // borderMode == 0
-                bilateral_caller<T, BrdReplicate>,    // borderMode == 1
-                bilateral_caller<T, BrdConstant>,     // borderMode == 2
-                bilateral_caller<T, BrdReflect>,      // borderMode == 3
-                bilateral_caller<T, BrdWrap>,         // borderMode == 4
-            };
-
-            const caller_t call = callers[borderMode];
-            call(src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream);
-        }
-    }
-}}}
-
-
-// Explicitly instantiates bilateral_filter_gpu for pixel type T so that the template,
-// which is defined only in this translation unit, can be linked from other files.
-#define OCV_INSTANTIATE_BILATERAL_FILTER(T) \
-    template void cv::gpu::cudev::imgproc::bilateral_filter_gpu<T>(const PtrStepSzb&, PtrStepSzb, int, float, float, int, cudaStream_t);
-
-OCV_INSTANTIATE_BILATERAL_FILTER(uchar)
-//OCV_INSTANTIATE_BILATERAL_FILTER(uchar2)
-OCV_INSTANTIATE_BILATERAL_FILTER(uchar3)
-OCV_INSTANTIATE_BILATERAL_FILTER(uchar4)
-
-//OCV_INSTANTIATE_BILATERAL_FILTER(schar)
-//OCV_INSTANTIATE_BILATERAL_FILTER(schar2)
-//OCV_INSTANTIATE_BILATERAL_FILTER(schar3)
-//OCV_INSTANTIATE_BILATERAL_FILTER(schar4)
-
-OCV_INSTANTIATE_BILATERAL_FILTER(short)
-//OCV_INSTANTIATE_BILATERAL_FILTER(short2)
-OCV_INSTANTIATE_BILATERAL_FILTER(short3)
-OCV_INSTANTIATE_BILATERAL_FILTER(short4)
-
-OCV_INSTANTIATE_BILATERAL_FILTER(ushort)
-//OCV_INSTANTIATE_BILATERAL_FILTER(ushort2)
-OCV_INSTANTIATE_BILATERAL_FILTER(ushort3)
-OCV_INSTANTIATE_BILATERAL_FILTER(ushort4)
-
-//OCV_INSTANTIATE_BILATERAL_FILTER(int)
-//OCV_INSTANTIATE_BILATERAL_FILTER(int2)
-//OCV_INSTANTIATE_BILATERAL_FILTER(int3)
-//OCV_INSTANTIATE_BILATERAL_FILTER(int4)
-
-OCV_INSTANTIATE_BILATERAL_FILTER(float)
-//OCV_INSTANTIATE_BILATERAL_FILTER(float2)
-OCV_INSTANTIATE_BILATERAL_FILTER(float3)
-OCV_INSTANTIATE_BILATERAL_FILTER(float4)
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/blend.cu
+++ b/modules/gpu/src/cuda/blend.cu
@@ -1,121 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace blend
-    {
-        // Per-element linear blend of two images:
-        //   result = (img1 * w1 + img2 * w2) / (w1 + w2 + 1e-5)
-        // x addresses individual channel elements of a row (bounded by cols), while the
-        // weight maps hold one value per pixel, hence the x / cn lookup. The small
-        // epsilon keeps the division finite when both weights are zero.
-        template <typename T>
-        __global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
-                                          const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            // Threads outside the image have nothing to do.
-            if (y >= rows || x >= cols)
-                return;
-
-            const int pixelCol = x / cn;
-            const float w1 = weights1.ptr(y)[pixelCol];
-            const float w2 = weights2.ptr(y)[pixelCol];
-            const T p1 = img1.ptr(y)[x];
-            const T p2 = img2.ptr(y)[x];
-
-            result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
-        }
-
-        // Host-side launcher for blendLinearKernel. The kernel works on individual channel
-        // elements, so both the grid width and the kernel's column bound use cols * cn.
-        // Blocks until completion only when the default stream (0) is used.
-        template <typename T>
-        void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
-        {
-            const int elemCols = cols * cn;
-
-            const dim3 block(16, 16);
-            const dim3 blocks(divUp(elemCols, block.x), divUp(rows, block.y));
-
-            blendLinearKernel<<<blocks, block, 0, stream>>>(rows, elemCols, cn, img1, img2, weights1, weights2, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            const bool onDefaultStream = (stream == 0);
-            if (onDefaultStream)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-        template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
-        template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
-
-
-        // Linear blend specialised for 4-channel 8-bit images: each thread handles one
-        // whole uchar4 pixel. The two weights are normalised by (w1 + w2 + 1e-5) first and
-        // then applied to every channel.
-        __global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
-                                              const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y >= rows || x >= cols)
-                return;
-
-            float w1 = weights1.ptr(y)[x];
-            float w2 = weights2.ptr(y)[x];
-
-            // Normalise so that the two weights sum to (almost exactly) one.
-            const float norm = 1.f / (w1 + w2 + 1e-5f);
-            w1 *= norm;
-            w2 *= norm;
-
-            // Rows are stored as bytes; view them as packed uchar4 pixels.
-            const uchar4* row1 = (const uchar4*)img1.ptr(y);
-            const uchar4* row2 = (const uchar4*)img2.ptr(y);
-            uchar4* rowOut = (uchar4*)result.ptr(y);
-
-            const uchar4 p1 = row1[x];
-            const uchar4 p2 = row2[x];
-
-            rowOut[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,
-                                    p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);
-        }
-
-        // Host-side launcher for blendLinearKernel8UC4 (one thread per pixel, 16x16 blocks).
-        // Blocks until completion only when the default stream (0) is used.
-        void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
-        {
-            const dim3 block(16, 16);
-            const dim3 blocks(divUp(cols, block.x), divUp(rows, block.y));
-
-            blendLinearKernel8UC4<<<blocks, block, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            const bool onDefaultStream = (stream == 0);
-            if (onDefaultStream)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-    } // namespace blend
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@@ -1,494 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include <utility>
-#include <algorithm>//std::swap
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/emulation.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/utility.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::cudev;
-
-namespace canny
-{
-    // Gradient-magnitude functors: both take the integer x/y derivatives of a pixel and
-    // return the magnitude as float. The user-provided empty constructors let the
-    // functors be created on the host and copied into kernels.
-
-    // L1 magnitude: |x| + |y|.
-    struct L1 : binary_function<int, int, float>
-    {
-        __host__ __device__ __forceinline__ L1() {}
-        __host__ __device__ __forceinline__ L1(const L1&) {}
-
-        __device__ __forceinline__ float operator ()(int x, int y) const
-        {
-            const int absSum = ::abs(x) + ::abs(y);
-            return (float)absSum;
-        }
-    };
-
-    // L2 magnitude: sqrt(x^2 + y^2). The sum of squares is formed in int arithmetic
-    // before the conversion to float.
-    struct L2 : binary_function<int, int, float>
-    {
-        __host__ __device__ __forceinline__ L2() {}
-        __host__ __device__ __forceinline__ L2(const L2&) {}
-
-        __device__ __forceinline__ float operator ()(int x, int y) const
-        {
-            const int squareSum = x * x + y * y;
-            return ::sqrtf((float)squareSum);
-        }
-    };
-}
-
-// Traits specialisations that override smart_shift for the two magnitude functors.
-// NOTE(review): smart_shift is consumed by the transform() implementation declared in
-// transform.hpp; its exact meaning is not visible in this file - confirm there.
-namespace cv { namespace gpu { namespace cudev
-{
-    template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
-    {
-        enum { smart_shift = 4 };
-    };
-    template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
-    {
-        enum { smart_shift = 4 };
-    };
-}}}
-
-namespace canny
-{
-    // 2D texture reference over the 8-bit source image: unnormalised coordinates, point
-    // sampling, clamp addressing (out-of-range fetches return the nearest edge texel).
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
-    // Accessor that reads tex_src at (row y, column x) shifted by a fixed (xoff, yoff) offset.
-    struct SrcTex
-    {
-        const int xoff;
-        const int yoff;
-        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
-
-        // Note the (y, x) argument order; tex2D itself takes (x, y).
-        __device__ __forceinline__ int operator ()(int y, int x) const
-        {
-            return tex2D(tex_src, x + xoff, y + yoff);
-        }
-    };
-
-    // Computes 3x3 Sobel derivatives of the source for every pixel of `mag` and stores
-    // dx, dy and the gradient magnitude norm(dx, dy). One thread per pixel.
-    template <class Norm> __global__
-    void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (y >= mag.rows || x >= mag.cols)
-            return;
-
-        // Horizontal derivative: (1, 2, 1)-weighted right column minus left column.
-        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
-        // Vertical derivative: (1, 2, 1)-weighted bottom row minus top row.
-        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
-
-        dx(y, x) = dxVal;
-        dy(y, x) = dyVal;
-
-        mag(y, x) = norm(dxVal, dyVal);
-    }
-
-    // Computes Sobel derivatives (dx, dy) and the gradient magnitude directly from the
-    // 8-bit source image.
-    //   srcWhole    - image bound to tex_src; the kernel samples it through the texture
-    //   xoff, yoff  - offset added to every texture coordinate
-    //   dx, dy, mag - outputs; the launch grid is sized from mag
-    //   L2Grad      - true: sqrt(dx^2 + dy^2) magnitude, false: |dx| + |dy|
-    // The kernel runs on the default stream and the call blocks until it has finished.
-    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
-    {
-        const dim3 block(16, 16);
-        const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
-
-        bindTexture(&tex_src, srcWhole);
-        SrcTex src(xoff, yoff);
-
-        if (L2Grad)
-        {
-            L2 norm;
-            calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
-        }
-        else
-        {
-            L1 norm;
-            calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
-        }
-
-        cudaSafeCall( cudaGetLastError() );
-
-        // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its direct
-        // replacement and is what every other function in this file uses.
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    // Computes the gradient magnitude from derivative images that already exist, using
-    // the generic element-wise transform() with no mask on stream 0.
-    //   L2Grad - true: sqrt(dx^2 + dy^2) magnitude, false: |dx| + |dy|
-    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
-    {
-        if (!L2Grad)
-        {
-            L1 l1Norm;
-            transform(dx, dy, mag, l1Norm, WithOutMask(), 0);
-            return;
-        }
-
-        L2 l2Norm;
-        transform(dx, dy, mag, l2Norm, WithOutMask(), 0);
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////
-
-namespace canny
-{
-    // 2D texture reference over the float magnitude image: unnormalised coordinates,
-    // point sampling, clamp addressing.
-    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
-
-    // Non-maximum suppression plus double thresholding. For every interior pixel this
-    // writes an edge class to `map`:
-    //   0 - not an edge, 1 - candidate (above low_thresh), 2 - definite (above high_thresh).
-    // A pixel is only classified 1 or 2 if its magnitude is a local maximum along the
-    // quantised gradient direction. Pixels on the one-pixel image border are not written.
-    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
-    {
-        // Fixed-point scale (2^15) and tan(22.5 deg) expressed in that scale.
-        const int CANNY_SHIFT = 15;
-        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
-
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        // Skip the border and any thread outside the image.
-        if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
-            return;
-
-        int dxVal = dx(y, x);
-        int dyVal = dy(y, x);
-
-        // s = -1 when dx and dy have opposite signs, +1 otherwise; it selects which
-        // diagonal is compared in the diagonal case below.
-        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
-        const float m = tex2D(tex_mag, x, y);
-
-        dxVal = ::abs(dxVal);
-        dyVal = ::abs(dyVal);
-
-        // 0 - the pixel can not belong to an edge
-        // 1 - the pixel might belong to an edge
-        // 2 - the pixel does belong to an edge
-        int edge_type = 0;
-
-        if (m > low_thresh)
-        {
-            // |dx| * tan(22.5 deg) and |dx| * tan(67.5 deg) in fixed point
-            // (tan(67.5 deg) = tan(22.5 deg) + 2).
-            const int tg22x = dxVal * TG22;
-            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
-
-            dyVal <<= CANNY_SHIFT;
-
-            if (dyVal < tg22x)
-            {
-                // Mostly horizontal gradient: compare with the left/right neighbours.
-                if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
-                    edge_type = 1 + (int)(m > high_thresh);
-            }
-            else if(dyVal > tg67x)
-            {
-                // Mostly vertical gradient: compare with the neighbours above/below.
-                if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
-                    edge_type = 1 + (int)(m > high_thresh);
-            }
-            else
-            {
-                // Diagonal gradient: compare along the diagonal selected by s.
-                if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
-                    edge_type = 1 + (int)(m > high_thresh);
-            }
-        }
-
-        map(y, x) = edge_type;
-    }
-
-    // Host-side launcher for calcMapKernel: binds the magnitude image to tex_mag, runs
-    // the kernel over the extent of dx on the default stream and waits for completion.
-    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
-    {
-        // The kernel samples the magnitude through the texture, so bind it first.
-        bindTexture(&tex_mag, mag);
-
-        const dim3 threads(16, 16);
-        const dim3 blocks(divUp(dx.cols, threads.x), divUp(dx.rows, threads.y));
-
-        calcMapKernel<<<blocks, threads>>>(dx, dy, map, low_thresh, high_thresh);
-        cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////
-
-namespace canny
-{
-    __device__ int counter = 0;
-
-    // Block-local hysteresis pass over the edge map (0 = none, 1 = candidate, 2 = edge).
-    // Each block loads its tile plus a one-pixel halo into shared memory, then repeatedly
-    // promotes candidates that touch an edge pixel. The result is written back to `map`,
-    // and every edge pixel that still has a candidate neighbour is appended to `st`
-    // (position taken from the global `counter`) as a seed for the global pass.
-    // The 18x18 tile and the 16 iterations assume a 16x16 thread block.
-    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
-    {
-        __shared__ volatile int smem[18][18];
-
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        // Threads of partial edge blocks may lie outside the image. Every load below must
-        // check BOTH coordinates: previously the halo rows did not check x and the halo
-        // columns did not check y, so such threads read outside the image and could feed
-        // garbage values into the tile.
-        const bool xInside = x < map.cols;
-        const bool yInside = y < map.rows;
-
-        smem[threadIdx.y + 1][threadIdx.x + 1] = (xInside && yInside) ? map(y, x) : 0;
-        if (threadIdx.y == 0)
-            smem[0][threadIdx.x + 1] = (y > 0 && xInside) ? map(y - 1, x) : 0;
-        if (threadIdx.y == blockDim.y - 1)
-            smem[blockDim.y + 1][threadIdx.x + 1] = (y + 1 < map.rows && xInside) ? map(y + 1, x) : 0;
-        if (threadIdx.x == 0)
-            smem[threadIdx.y + 1][0] = (x > 0 && yInside) ? map(y, x - 1) : 0;
-        if (threadIdx.x == blockDim.x - 1)
-            smem[threadIdx.y + 1][blockDim.x + 1] = (x + 1 < map.cols && yInside) ? map(y, x + 1) : 0;
-        // Corner halo cells. The threads loading them sit on the first/last row and column
-        // of the block; the existing conditions already keep both indices inside the image
-        // when the grid is sized with divUp, as the host launcher does.
-        if (threadIdx.x == 0 && threadIdx.y == 0)
-            smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
-        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
-            smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
-        if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
-            smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
-        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
-            smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
-
-        __syncthreads();
-
-        if (x >= map.cols || y >= map.rows)
-            return;
-
-        int n;
-
-        // Propagate "edge" (2) into neighbouring candidates (1) inside the tile.
-        #pragma unroll
-        for (int k = 0; k < 16; ++k)
-        {
-            n = 0;
-
-            if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
-            {
-                n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
-                n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
-                n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;
-
-                n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
-                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
-
-                n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
-                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
-                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
-            }
-
-            if (n > 0)
-                smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
-        }
-
-        const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
-
-        map(y, x) = e;
-
-        n = 0;
-
-        // An edge pixel that still touches a candidate becomes a seed for the global pass.
-        if (e == 2)
-        {
-            n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
-            n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
-            n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;
-
-            n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
-            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
-
-            n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
-            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
-            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
-        }
-
-        if (n > 0)
-        {
-            // atomicAdd returns the previous counter value, i.e. this seed's slot.
-            const int ind =  ::atomicAdd(&counter, 1);
-            st[ind] = make_ushort2(x, y);
-        }
-    }
-
-    // Host-side launcher for edgesHysteresisLocalKernel: zeroes the device-side seed
-    // counter, runs the kernel over the whole map with 16x16 blocks on the default
-    // stream and waits for completion. Seeds are appended to st1.
-    void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
-    {
-        void* counterAddr;
-        cudaSafeCall( cudaGetSymbolAddress(&counterAddr, counter) );
-        cudaSafeCall( cudaMemset(counterAddr, 0, sizeof(int)) );
-
-        const dim3 threads(16, 16);
-        const dim3 blocks(divUp(map.cols, threads.x), divUp(map.rows, threads.y));
-
-        edgesHysteresisLocalKernel<<<blocks, threads>>>(map, st1);
-        cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////
-
-namespace canny
-{
-    __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
-    __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};
-
-    // Global hysteresis pass. Each block takes one seed position from st1 and grows the
-    // edge from it: candidate pixels (1) found among the 8 neighbours are promoted to
-    // edge (2) and pushed onto a shared-memory stack, from which they are expanded in
-    // turn. If the stack gets too full to guarantee room for another round, the
-    // remaining entries are spilled to st2 (slots reserved through the global `counter`)
-    // so that the host can run another pass over them.
-    //   map      - edge map, updated in place
-    //   st1      - seeds to process in this pass (count entries)
-    //   st2      - receives the seeds for the next pass
-    //   count    - number of valid entries in st1
-    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
-    {
-        const int stack_size = 512;
-
-        __shared__ int s_counter;
-        __shared__ int s_ind;
-        __shared__ ushort2 s_st[stack_size];
-
-        if (threadIdx.x == 0)
-            s_counter = 0;
-
-        __syncthreads();
-
-        // One seed per block; the 2D grid is flattened into a seed index.
-        int ind = blockIdx.y * gridDim.x + blockIdx.x;
-
-        // Uniform for the whole block, so returning here cannot split a barrier.
-        if (ind >= count)
-            return;
-
-        ushort2 pos = st1[ind];
-
-        // The first 8 threads each examine one neighbour of the seed.
-        if (threadIdx.x < 8)
-        {
-            pos.x += c_dx[threadIdx.x];
-            pos.y += c_dy[threadIdx.x];
-
-            if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
-            {
-                map(pos.y, pos.x) = 2;
-
-                ind = Emulation::smem::atomicAdd(&s_counter, 1);
-
-                s_st[ind] = pos;
-            }
-        }
-
-        __syncthreads();
-
-        // Keep expanding while there is work and a full round of pushes (at most one per
-        // thread) is still guaranteed to fit on the stack.
-        while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
-        {
-            // Threads work in groups of 8: each group pops one stack entry and its
-            // members examine that entry's 8 neighbours.
-            const int subTaskIdx = threadIdx.x >> 3;
-            const int portion = ::min(s_counter, blockDim.x >> 3);
-
-            if (subTaskIdx < portion)
-                pos = s_st[s_counter - 1 - subTaskIdx];
-
-            __syncthreads();
-
-            if (threadIdx.x == 0)
-                s_counter -= portion;
-
-            __syncthreads();
-
-            if (subTaskIdx < portion)
-            {
-                pos.x += c_dx[threadIdx.x & 7];
-                pos.y += c_dy[threadIdx.x & 7];
-
-                if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
-                {
-                    map(pos.y, pos.x) = 2;
-
-                    ind = Emulation::smem::atomicAdd(&s_counter, 1);
-
-                    s_st[ind] = pos;
-                }
-            }
-
-            __syncthreads();
-        }
-
-        // Spill whatever is left on the stack to st2 for the next pass.
-        if (s_counter > 0)
-        {
-            // atomicAdd returns the counter value BEFORE the addition, which is exactly
-            // the first slot of the range reserved for this block. Subtracting s_counter
-            // from it (as the code used to do) shifted the range backwards, writing before
-            // the start of st2 and leaving the slots the host reads next uninitialised.
-            if (threadIdx.x == 0)
-                s_ind = ::atomicAdd(&counter, s_counter);
-
-            __syncthreads();
-
-            ind = s_ind;
-
-            for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
-                st2[ind + i] = s_st[i];
-        }
-    }
-
-    // Drives the global hysteresis pass until no seeds remain. Each round reads the
-    // number of pending seeds from the device counter, resets the counter, launches one
-    // block per seed (128 threads each, grid split into rows of at most 65535 blocks),
-    // waits for completion, reads back how many new seeds were produced and swaps the
-    // two seed buffers.
-    void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
-    {
-        void* counterAddr;
-        cudaSafeCall( cudaGetSymbolAddress(&counterAddr, canny::counter) );
-
-        int pending;
-        cudaSafeCall( cudaMemcpy(&pending, counterAddr, sizeof(int), cudaMemcpyDeviceToHost) );
-
-        const dim3 threads(128);
-
-        while (pending > 0)
-        {
-            cudaSafeCall( cudaMemset(counterAddr, 0, sizeof(int)) );
-
-            const dim3 blocks(::min(pending, 65535u), divUp(pending, 65535), 1);
-
-            edgesHysteresisGlobalKernel<<<blocks, threads>>>(map, st1, st2, pending);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            cudaSafeCall( cudaMemcpy(&pending, counterAddr, sizeof(int), cudaMemcpyDeviceToHost) );
-
-            // The seeds just written to st2 are the input of the next round.
-            std::swap(st1, st2);
-        }
-    }
-}
-
-//////////////////////////////////////////////////////////////////////////////////////////
-
-namespace canny
-{
-    // Converts an edge-map class into an 8-bit output pixel: class 2 becomes 255,
-    // classes 0 and 1 become 0.
-    struct GetEdges : unary_function<int, uchar>
-    {
-        __host__ __device__ __forceinline__ GetEdges() {}
-        __host__ __device__ __forceinline__ GetEdges(const GetEdges&) {}
-
-        __device__ __forceinline__ uchar operator ()(int e) const
-        {
-            // e >> 1 is 1 only for class 2; negating gives -1, whose low byte is 0xFF.
-            const int negated = -(e >> 1);
-            return (uchar)negated;
-        }
-    };
-}
-
-// Traits specialisation that overrides smart_shift for the GetEdges functor.
-// NOTE(review): smart_shift is consumed by the transform() implementation declared in
-// transform.hpp; its exact meaning is not visible in this file - confirm there.
-namespace cv { namespace gpu { namespace cudev
-{
-    template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
-    {
-        enum { smart_shift = 4 };
-    };
-}}}
-
-namespace canny
-{
-    // Produces the final 8-bit edge image from the edge map by applying GetEdges to
-    // every element (no mask, stream 0).
-    void getEdges(PtrStepSzi map, PtrStepSzb dst)
-    {
-        GetEdges toPixel;
-        transform(map, dst, toPixel, WithOutMask(), 0);
-    }
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/clahe.cu
+++ b/modules/gpu/src/cuda/clahe.cu
@@ -1,186 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/emulation.hpp"
-#include "opencv2/core/cuda/scan.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::cudev;
-
-namespace clahe
-{
-    __global__ void calcLutKernel(const PtrStepb src, PtrStepb lut,
-                                  const int2 tileSize, const int tilesX,
-                                  const int clipLimit, const float lutScale)
-    {
-        __shared__ int smem[512];
-
-        const int tx = blockIdx.x;
-        const int ty = blockIdx.y;
-        const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
-        smem[tid] = 0;
-        __syncthreads();
-
-        for (int i = threadIdx.y; i < tileSize.y; i += blockDim.y)
-        {
-            const uchar* srcPtr = src.ptr(ty * tileSize.y + i) + tx * tileSize.x;
-            for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
-            {
-                const int data = srcPtr[j];
-                Emulation::smem::atomicAdd(&smem[data], 1);
-            }
-        }
-
-        __syncthreads();
-
-        int tHistVal = smem[tid];
-
-        __syncthreads();
-
-        if (clipLimit > 0)
-        {
-            // clip histogram bar
-
-            int clipped = 0;
-            if (tHistVal > clipLimit)
-            {
-                clipped = tHistVal - clipLimit;
-                tHistVal = clipLimit;
-            }
-
-            // find number of overall clipped samples
-
-            reduce<256>(smem, clipped, tid, plus<int>());
-
-            // broadcast evaluated value
-
-            __shared__ int totalClipped;
-
-            if (tid == 0)
-                totalClipped = clipped;
-            __syncthreads();
-
-            // redistribute clipped samples evenly
-
-            int redistBatch = totalClipped / 256;
-            tHistVal += redistBatch;
-
-            int residual = totalClipped - redistBatch * 256;
-            if (tid < residual)
-                ++tHistVal;
-        }
-
-        const int lutVal = blockScanInclusive<256>(tHistVal, smem, tid);
-
-        lut(ty * tilesX + tx, tid) = saturate_cast<uchar>(__float2int_rn(lutScale * lutVal));
-    }
-
-    void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream)
-    {
-        const dim3 block(32, 8);
-        const dim3 grid(tilesX, tilesY);
-
-        calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
-
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    __global__ void tranformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x >= src.cols || y >= src.rows)
-            return;
-
-        const float tyf = (static_cast<float>(y) / tileSize.y) - 0.5f;
-        int ty1 = __float2int_rd(tyf);
-        int ty2 = ty1 + 1;
-        const float ya = tyf - ty1;
-        ty1 = ::max(ty1, 0);
-        ty2 = ::min(ty2, tilesY - 1);
-
-        const float txf = (static_cast<float>(x) / tileSize.x) - 0.5f;
-        int tx1 = __float2int_rd(txf);
-        int tx2 = tx1 + 1;
-        const float xa = txf - tx1;
-        tx1 = ::max(tx1, 0);
-        tx2 = ::min(tx2, tilesX - 1);
-
-        const int srcVal = src(y, x);
-
-        float res = 0;
-
-        res += lut(ty1 * tilesX + tx1, srcVal) * ((1.0f - xa) * (1.0f - ya));
-        res += lut(ty1 * tilesX + tx2, srcVal) * ((xa) * (1.0f - ya));
-        res += lut(ty2 * tilesX + tx1, srcVal) * ((1.0f - xa) * (ya));
-        res += lut(ty2 * tilesX + tx2, srcVal) * ((xa) * (ya));
-
-        dst(y, x) = saturate_cast<uchar>(res);
-    }
-
-    void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream)
-    {
-        const dim3 block(32, 8);
-        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-
-        cudaSafeCall( cudaFuncSetCacheConfig(tranformKernel, cudaFuncCachePreferL1) );
-
-        tranformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-#endif // CUDA_DISABLER
--- a/modules/gpu/src/cuda/color.cu
+++ b/modules/gpu/src/cuda/color.cu
@@ -1,461 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/color.hpp"
-#include "cvt_color_internal.h"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_x = 8 };
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
-    {
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
-    {
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-    OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
-    {
-        enum { smart_block_dim_y = 8 };
-        enum { smart_shift = 4 };
-    };
-
-#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
-    void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) \
-    { \
-        traits::functor_type functor = traits::create_functor(); \
-        typedef typename traits::functor_type::argument_type src_t; \
-        typedef typename traits::functor_type::result_type   dst_t; \
-        cv::gpu::cudev::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \
-    }
-
-#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)
-
-#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
-
-#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
-
-#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(name) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv4)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv4)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgra)
-
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgb)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgba)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgr)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgra)
-    OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgra)
-
-    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
-    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
-    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
-    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
-    #undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL
-}}} // namespace cv { namespace gpu { namespace cudev
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.0.cu
+++ b/modules/gpu/src/cuda/column_filter.0.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "column_filter.h"
-
-namespace filter
-{
-    template void linearColumn<float, uchar>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.1.cu
+++ b/modules/gpu/src/cuda/column_filter.1.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "column_filter.h"
-
-namespace filter
-{
-    template void linearColumn<float3, uchar3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.10.cu
+++ b/modules/gpu/src/cuda/column_filter.10.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "column_filter.h"
-
-namespace filter
-{
-    template void linearColumn<float, unsigned short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.11.cu
+++ b/modules/gpu/src/cuda/column_filter.11.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "column_filter.h"
-
-namespace filter
-{
-    template void linearColumn<float3, ushort3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.12.cu
+++ b/modules/gpu/src/cuda/column_filter.12.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "column_filter.h"
-
-namespace filter
-{
-    template void linearColumn<float4, ushort4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.13.cu
+++ b/modules/gpu/src/cuda/column_filter.13.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "column_filter.h"
-
-namespace filter
-{
-    template void linearColumn<float3, int3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.14.cu
+++ b/modules/gpu/src/cuda/column_filter.14.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "column_filter.h"
-
-namespace filter
-{
-    template void linearColumn<float4, int4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.2.cu
+++ b/modules/gpu/src/cuda/column_filter.2.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "column_filter.h"
-
-namespace filter
-{
-    template void linearColumn<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.3.cu
+++ b/modules/gpu/src/cuda/column_filter.3.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "column_filter.h"
-
-namespace filter
-{
-    template void linearColumn<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.4.cu
+++ b/modules/gpu/src/cuda/column_filter.4.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "column_filter.h"
-
-namespace filter
-{
-    template void linearColumn<float, int>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.5.cu
+++ b/modules/gpu/src/cuda/column_filter.5.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "column_filter.h"
-
-namespace filter
-{
-    template void linearColumn<float, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.6.cu
+++ b/modules/gpu/src/cuda/column_filter.6.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "column_filter.h"
-
-namespace filter
-{
-    template void linearColumn<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.7.cu
+++ b/modules/gpu/src/cuda/column_filter.7.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "column_filter.h"
-
-namespace filter
-{
-    template void linearColumn<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.8.cu
+++ b/modules/gpu/src/cuda/column_filter.8.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "column_filter.h"
-
-namespace filter
-{
-    template void linearColumn<float, short>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.9.cu
+++ b/modules/gpu/src/cuda/column_filter.9.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "column_filter.h"
-
-namespace filter
-{
-    // Explicit instantiation of the column filter for T = float4 (source) and D = short4 (destination).
-    // The template itself is defined in column_filter.h, included above.
-    // NOTE(review): presumably each type pair lives in its own .cu file to keep per-file compile time down - confirm.
-    template void linearColumn<float4, short4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/column_filter.h
+++ b/modules/gpu/src/cuda/column_filter.h
@@ -1,372 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::cudev;
-
-namespace column_filter
-{
-    // Capacity, in floats, of the coefficient array declared below.
-    #define MAX_KERNEL_SIZE 32
-
-    // Filter coefficients in constant memory. linearColumnFilter reads them as c_kernel[k];
-    // filter::linearColumn fills them with cudaMemcpyToSymbol(Async) before launching the kernel.
-    __constant__ float c_kernel[MAX_KERNEL_SIZE];
-
-    // Column (vertical) pass of a linear filter. For every output pixel the kernel sums
-    // KSIZE vertically adjacent source values, each weighted by c_kernel[k].
-    //
-    // Template parameters:
-    //   KSIZE - number of filter taps; a compile-time constant so the tap loop can be unrolled
-    //   T     - source element type
-    //   D     - destination element type
-    //   B     - border policy type; must provide at_low() and at_high()
-    //
-    // Parameters:
-    //   src    - input image
-    //   dst    - output image; only rows y < src.rows are written
-    //   anchor - position of the filtered pixel inside the tap window (tap k reads row y - anchor + k)
-    //   brd    - border policy instance used for rows that may fall outside the image
-    //
-    // The tile constants below have to agree with the launch configuration that caller()
-    // computes on the host from the compute capability.
-    template <int KSIZE, typename T, typename D, typename B>
-    __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
-    {
-        // Tile geometry: a block is BLOCK_DIM_X x BLOCK_DIM_Y threads, every thread produces
-        // PATCH_PER_BLOCK output rows, and HALO_SIZE * BLOCK_DIM_Y extra rows are staged above and below.
-        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
-            const int BLOCK_DIM_X = 16;
-            const int BLOCK_DIM_Y = 16;
-            const int PATCH_PER_BLOCK = 4;
-            const int HALO_SIZE = KSIZE <= 16 ? 1 : 2;
-        #else
-            const int BLOCK_DIM_X = 16;
-            const int BLOCK_DIM_Y = 8;
-            const int PATCH_PER_BLOCK = 2;
-            const int HALO_SIZE = 2;
-        #endif
-
-        // Accumulator type: float vector with the same channel count as T.
-        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
-
-        // Shared tile: upper halo, main data and lower halo stacked vertically, one column per threadIdx.x.
-        __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X];
-
-        const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
-
-        // Columns past the right edge of the image have nothing to load or store.
-        // NOTE(review): threads leaving here never reach the __syncthreads() below - confirm this is
-        // acceptable on every targeted architecture.
-        if (x >= src.cols)
-            return;
-
-        // Start of this thread's column in row 0; handed to the border policy together with src.step.
-        const T* src_col = src.ptr() + x;
-
-        // First output row handled by this thread; the others follow in steps of BLOCK_DIM_Y.
-        const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y;
-
-        if (blockIdx.y > 0)
-        {
-            // Upper halo: not the first block row, so the rows are read straight from src.
-            #pragma unroll
-            for (int j = 0; j < HALO_SIZE; ++j)
-                smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x));
-        }
-        else
-        {
-            // Upper halo: first block row, the row indices are negative and go through the border policy.
-            #pragma unroll
-            for (int j = 0; j < HALO_SIZE; ++j)
-                smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step));
-        }
-
-        // All block rows except the last two read src directly; the last two go through brd.at_high.
-        // NOTE(review): presumably because only those two can touch rows >= src.rows - confirm.
-        if (blockIdx.y + 2 < gridDim.y)
-        {
-            // Main data
-            #pragma unroll
-            for (int j = 0; j < PATCH_PER_BLOCK; ++j)
-                smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x));
-
-            // Lower halo
-            #pragma unroll
-            for (int j = 0; j < HALO_SIZE; ++j)
-                smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x));
-        }
-        else
-        {
-            // Main data, fetched through the border policy
-            #pragma unroll
-            for (int j = 0; j < PATCH_PER_BLOCK; ++j)
-                smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step));
-
-            // Lower halo, fetched through the border policy
-            #pragma unroll
-            for (int j = 0; j < HALO_SIZE; ++j)
-                smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step));
-        }
-
-        // The tap loop below reads rows of the tile that were written by other threads of the block.
-        __syncthreads();
-
-        #pragma unroll
-        for (int j = 0; j < PATCH_PER_BLOCK; ++j)
-        {
-            const int y = yStart + j * BLOCK_DIM_Y;
-
-            // Rows past the bottom edge were staged for neighbours only; nothing is written for them.
-            if (y < src.rows)
-            {
-                sum_t sum = VecTraits<sum_t>::all(0);
-
-                // Tap k reads the tile row that lies (k - anchor) rows away from this thread's own row.
-                #pragma unroll
-                for (int k = 0; k < KSIZE; ++k)
-                    sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
-
-                dst(y, x) = saturate_cast<D>(sum);
-            }
-        }
-    }
-
-    // Host-side launcher for linearColumnFilter<KSIZE, T, D> with border policy B<T>.
-    //
-    //   src, dst - input and output images
-    //   anchor   - forwarded unchanged to the kernel
-    //   cc       - compute capability as major * 10 + minor; selects the tile geometry
-    //   stream   - CUDA stream to launch on; for stream 0 the call waits for the device to finish
-    template <int KSIZE, typename T, typename D, template<typename> class B>
-    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
-    {
-        // These values have to mirror the compile-time tile constants inside linearColumnFilter.
-        const bool sm20OrNewer = cc >= 20;
-        const int tileWidth = 16;
-        const int tileHeight = sm20OrNewer ? 16 : 8;
-        const int rowsPerThread = sm20OrNewer ? 4 : 2;
-
-        const dim3 threads(tileWidth, tileHeight);
-        const dim3 blocks(divUp(src.cols, tileWidth), divUp(src.rows, tileHeight * rowsPerThread));
-
-        B<T> border(src.rows);
-
-        linearColumnFilter<KSIZE, T, D><<<blocks, threads, 0, stream>>>(src, dst, anchor, border);
-
-        // Surface launch-configuration errors right away.
-        cudaSafeCall( cudaGetLastError() );
-
-        // Only the default stream is synchronised here; other streams stay asynchronous.
-        if (stream != 0)
-            return;
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-namespace filter
-{
-    // Entry point of the column filter: uploads the coefficients to constant memory and
-    // dispatches to the kernel launcher that matches the border mode and kernel size.
-    //
-    //   src, dst - input and output images as byte-typed views, reinterpreted as PtrStepSz<T> / PtrStepSz<D>
-    //   kernel   - ksize filter coefficients; copied with cudaMemcpyDeviceToDevice, i.e. treated as a device pointer
-    //   ksize    - number of coefficients; second index into the dispatch table
-    //   anchor   - forwarded unchanged to the launcher
-    //   brd_type - first index into the dispatch table; rows are ordered
-    //              Reflect101, Replicate, Constant, Reflect, Wrap
-    //   cc       - forwarded unchanged to the launcher
-    //   stream   - CUDA stream; 0 selects the synchronous coefficient copy
-    //
-    // NOTE(review): neither brd_type nor ksize is range-checked here. Slot 0 of every row is a null
-    // pointer, the table has 5 x 33 entries and c_kernel holds 32 floats - confirm that callers
-    // guarantee 0 <= brd_type <= 4 and 1 <= ksize <= 32.
-    template <typename T, typename D>
-    void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
-    {
-        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
-
-        // callers[brd_type][ksize]: one launcher instantiation per border policy and kernel size.
-        static const caller_t callers[5][33] =
-        {
-            {
-                0,
-                column_filter::caller< 1, T, D, BrdColReflect101>,
-                column_filter::caller< 2, T, D, BrdColReflect101>,
-                column_filter::caller< 3, T, D, BrdColReflect101>,
-                column_filter::caller< 4, T, D, BrdColReflect101>,
-                column_filter::caller< 5, T, D, BrdColReflect101>,
-                column_filter::caller< 6, T, D, BrdColReflect101>,
-                column_filter::caller< 7, T, D, BrdColReflect101>,
-                column_filter::caller< 8, T, D, BrdColReflect101>,
-                column_filter::caller< 9, T, D, BrdColReflect101>,
-                column_filter::caller<10, T, D, BrdColReflect101>,
-                column_filter::caller<11, T, D, BrdColReflect101>,
-                column_filter::caller<12, T, D, BrdColReflect101>,
-                column_filter::caller<13, T, D, BrdColReflect101>,
-                column_filter::caller<14, T, D, BrdColReflect101>,
-                column_filter::caller<15, T, D, BrdColReflect101>,
-                column_filter::caller<16, T, D, BrdColReflect101>,
-                column_filter::caller<17, T, D, BrdColReflect101>,
-                column_filter::caller<18, T, D, BrdColReflect101>,
-                column_filter::caller<19, T, D, BrdColReflect101>,
-                column_filter::caller<20, T, D, BrdColReflect101>,
-                column_filter::caller<21, T, D, BrdColReflect101>,
-                column_filter::caller<22, T, D, BrdColReflect101>,
-                column_filter::caller<23, T, D, BrdColReflect101>,
-                column_filter::caller<24, T, D, BrdColReflect101>,
-                column_filter::caller<25, T, D, BrdColReflect101>,
-                column_filter::caller<26, T, D, BrdColReflect101>,
-                column_filter::caller<27, T, D, BrdColReflect101>,
-                column_filter::caller<28, T, D, BrdColReflect101>,
-                column_filter::caller<29, T, D, BrdColReflect101>,
-                column_filter::caller<30, T, D, BrdColReflect101>,
-                column_filter::caller<31, T, D, BrdColReflect101>,
-                column_filter::caller<32, T, D, BrdColReflect101>
-            },
-            {
-                0,
-                column_filter::caller< 1, T, D, BrdColReplicate>,
-                column_filter::caller< 2, T, D, BrdColReplicate>,
-                column_filter::caller< 3, T, D, BrdColReplicate>,
-                column_filter::caller< 4, T, D, BrdColReplicate>,
-                column_filter::caller< 5, T, D, BrdColReplicate>,
-                column_filter::caller< 6, T, D, BrdColReplicate>,
-                column_filter::caller< 7, T, D, BrdColReplicate>,
-                column_filter::caller< 8, T, D, BrdColReplicate>,
-                column_filter::caller< 9, T, D, BrdColReplicate>,
-                column_filter::caller<10, T, D, BrdColReplicate>,
-                column_filter::caller<11, T, D, BrdColReplicate>,
-                column_filter::caller<12, T, D, BrdColReplicate>,
-                column_filter::caller<13, T, D, BrdColReplicate>,
-                column_filter::caller<14, T, D, BrdColReplicate>,
-                column_filter::caller<15, T, D, BrdColReplicate>,
-                column_filter::caller<16, T, D, BrdColReplicate>,
-                column_filter::caller<17, T, D, BrdColReplicate>,
-                column_filter::caller<18, T, D, BrdColReplicate>,
-                column_filter::caller<19, T, D, BrdColReplicate>,
-                column_filter::caller<20, T, D, BrdColReplicate>,
-                column_filter::caller<21, T, D, BrdColReplicate>,
-                column_filter::caller<22, T, D, BrdColReplicate>,
-                column_filter::caller<23, T, D, BrdColReplicate>,
-                column_filter::caller<24, T, D, BrdColReplicate>,
-                column_filter::caller<25, T, D, BrdColReplicate>,
-                column_filter::caller<26, T, D, BrdColReplicate>,
-                column_filter::caller<27, T, D, BrdColReplicate>,
-                column_filter::caller<28, T, D, BrdColReplicate>,
-                column_filter::caller<29, T, D, BrdColReplicate>,
-                column_filter::caller<30, T, D, BrdColReplicate>,
-                column_filter::caller<31, T, D, BrdColReplicate>,
-                column_filter::caller<32, T, D, BrdColReplicate>
-            },
-            {
-                0,
-                column_filter::caller< 1, T, D, BrdColConstant>,
-                column_filter::caller< 2, T, D, BrdColConstant>,
-                column_filter::caller< 3, T, D, BrdColConstant>,
-                column_filter::caller< 4, T, D, BrdColConstant>,
-                column_filter::caller< 5, T, D, BrdColConstant>,
-                column_filter::caller< 6, T, D, BrdColConstant>,
-                column_filter::caller< 7, T, D, BrdColConstant>,
-                column_filter::caller< 8, T, D, BrdColConstant>,
-                column_filter::caller< 9, T, D, BrdColConstant>,
-                column_filter::caller<10, T, D, BrdColConstant>,
-                column_filter::caller<11, T, D, BrdColConstant>,
-                column_filter::caller<12, T, D, BrdColConstant>,
-                column_filter::caller<13, T, D, BrdColConstant>,
-                column_filter::caller<14, T, D, BrdColConstant>,
-                column_filter::caller<15, T, D, BrdColConstant>,
-                column_filter::caller<16, T, D, BrdColConstant>,
-                column_filter::caller<17, T, D, BrdColConstant>,
-                column_filter::caller<18, T, D, BrdColConstant>,
-                column_filter::caller<19, T, D, BrdColConstant>,
-                column_filter::caller<20, T, D, BrdColConstant>,
-                column_filter::caller<21, T, D, BrdColConstant>,
-                column_filter::caller<22, T, D, BrdColConstant>,
-                column_filter::caller<23, T, D, BrdColConstant>,
-                column_filter::caller<24, T, D, BrdColConstant>,
-                column_filter::caller<25, T, D, BrdColConstant>,
-                column_filter::caller<26, T, D, BrdColConstant>,
-                column_filter::caller<27, T, D, BrdColConstant>,
-                column_filter::caller<28, T, D, BrdColConstant>,
-                column_filter::caller<29, T, D, BrdColConstant>,
-                column_filter::caller<30, T, D, BrdColConstant>,
-                column_filter::caller<31, T, D, BrdColConstant>,
-                column_filter::caller<32, T, D, BrdColConstant>
-            },
-            {
-                0,
-                column_filter::caller< 1, T, D, BrdColReflect>,
-                column_filter::caller< 2, T, D, BrdColReflect>,
-                column_filter::caller< 3, T, D, BrdColReflect>,
-                column_filter::caller< 4, T, D, BrdColReflect>,
-                column_filter::caller< 5, T, D, BrdColReflect>,
-                column_filter::caller< 6, T, D, BrdColReflect>,
-                column_filter::caller< 7, T, D, BrdColReflect>,
-                column_filter::caller< 8, T, D, BrdColReflect>,
-                column_filter::caller< 9, T, D, BrdColReflect>,
-                column_filter::caller<10, T, D, BrdColReflect>,
-                column_filter::caller<11, T, D, BrdColReflect>,
-                column_filter::caller<12, T, D, BrdColReflect>,
-                column_filter::caller<13, T, D, BrdColReflect>,
-                column_filter::caller<14, T, D, BrdColReflect>,
-                column_filter::caller<15, T, D, BrdColReflect>,
-                column_filter::caller<16, T, D, BrdColReflect>,
-                column_filter::caller<17, T, D, BrdColReflect>,
-                column_filter::caller<18, T, D, BrdColReflect>,
-                column_filter::caller<19, T, D, BrdColReflect>,
-                column_filter::caller<20, T, D, BrdColReflect>,
-                column_filter::caller<21, T, D, BrdColReflect>,
-                column_filter::caller<22, T, D, BrdColReflect>,
-                column_filter::caller<23, T, D, BrdColReflect>,
-                column_filter::caller<24, T, D, BrdColReflect>,
-                column_filter::caller<25, T, D, BrdColReflect>,
-                column_filter::caller<26, T, D, BrdColReflect>,
-                column_filter::caller<27, T, D, BrdColReflect>,
-                column_filter::caller<28, T, D, BrdColReflect>,
-                column_filter::caller<29, T, D, BrdColReflect>,
-                column_filter::caller<30, T, D, BrdColReflect>,
-                column_filter::caller<31, T, D, BrdColReflect>,
-                column_filter::caller<32, T, D, BrdColReflect>
-            },
-            {
-                0,
-                column_filter::caller< 1, T, D, BrdColWrap>,
-                column_filter::caller< 2, T, D, BrdColWrap>,
-                column_filter::caller< 3, T, D, BrdColWrap>,
-                column_filter::caller< 4, T, D, BrdColWrap>,
-                column_filter::caller< 5, T, D, BrdColWrap>,
-                column_filter::caller< 6, T, D, BrdColWrap>,
-                column_filter::caller< 7, T, D, BrdColWrap>,
-                column_filter::caller< 8, T, D, BrdColWrap>,
-                column_filter::caller< 9, T, D, BrdColWrap>,
-                column_filter::caller<10, T, D, BrdColWrap>,
-                column_filter::caller<11, T, D, BrdColWrap>,
-                column_filter::caller<12, T, D, BrdColWrap>,
-                column_filter::caller<13, T, D, BrdColWrap>,
-                column_filter::caller<14, T, D, BrdColWrap>,
-                column_filter::caller<15, T, D, BrdColWrap>,
-                column_filter::caller<16, T, D, BrdColWrap>,
-                column_filter::caller<17, T, D, BrdColWrap>,
-                column_filter::caller<18, T, D, BrdColWrap>,
-                column_filter::caller<19, T, D, BrdColWrap>,
-                column_filter::caller<20, T, D, BrdColWrap>,
-                column_filter::caller<21, T, D, BrdColWrap>,
-                column_filter::caller<22, T, D, BrdColWrap>,
-                column_filter::caller<23, T, D, BrdColWrap>,
-                column_filter::caller<24, T, D, BrdColWrap>,
-                column_filter::caller<25, T, D, BrdColWrap>,
-                column_filter::caller<26, T, D, BrdColWrap>,
-                column_filter::caller<27, T, D, BrdColWrap>,
-                column_filter::caller<28, T, D, BrdColWrap>,
-                column_filter::caller<29, T, D, BrdColWrap>,
-                column_filter::caller<30, T, D, BrdColWrap>,
-                column_filter::caller<31, T, D, BrdColWrap>,
-                column_filter::caller<32, T, D, BrdColWrap>
-            }
-        };
-
-        // Upload the coefficients: blocking copy on the default stream, otherwise an
-        // asynchronous copy queued on the same stream as the kernel launch that follows.
-        if (stream == 0)
-            cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
-        else
-            cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-
-        // Reinterpret the byte-typed views with the real element types and launch.
-        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
-    }
-}
--- a/modules/gpu/src/cuda/copy_make_border.cu
+++ b/modules/gpu/src/cuda/copy_make_border.cu
@@ -1,131 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        // One thread per destination pixel: dst(row, col) receives src(row - top, col - left).
-        // NOTE(review): src is expected to resolve coordinates outside the source image itself
-        // (the dispatcher passes a BorderReader) - confirm for any other Ptr2D type.
-        template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, PtrStepSz<T> dst, int top, int left)
-        {
-            const int col = blockDim.x * blockIdx.x + threadIdx.x;
-            const int row = blockDim.y * blockIdx.y + threadIdx.y;
-
-            // Threads that fall outside the destination image have nothing to write.
-            if (col >= dst.cols || row >= dst.rows)
-                return;
-
-            dst.ptr(row)[col] = src(row - top, col - left);
-        }
-
-        // Host-side launcher of the copyMakeBorder kernel for border policy B and pixel type T.
-        template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
-        {
-            // Wraps src in a BorderReader built from B<T> and launches one thread per destination pixel.
-            //   src, dst    - source image and (larger) destination image
-            //   top, left   - offset of the source image inside the destination
-            //   borderValue - per-channel values handed to the border policy constructor
-            //   stream      - CUDA stream; for stream 0 the call waits for the device to finish
-            static void call(const PtrStepSz<T>& src, const PtrStepSz<T>& dst, int top, int left,
-                const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
-            {
-                const dim3 threads(32, 8);
-                const dim3 blocks(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
-
-                B<T> border(src.rows, src.cols, VecTraits<T>::make(borderValue));
-                BorderReader< PtrStep<T>, B<T> > borderedSrc(src, border);
-
-                copyMakeBorder<<<blocks, threads, 0, stream>>>(borderedSrc, dst, top, left);
-                cudaSafeCall( cudaGetLastError() );
-
-                // Only the default stream is synchronised here.
-                if (stream != 0)
-                    return;
-
-                cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        };
-
-        // Type-erased entry point: reinterprets the byte-typed views as images of cn-channel T pixels
-        // and forwards to the dispatcher selected by borderMode.
-        // borderMode indexes the table below in the order Reflect101, Replicate, Constant, Reflect, Wrap.
-        // NOTE(review): borderMode is not range-checked here - confirm callers pass 0..4.
-        template <typename T, int cn> void copyMakeBorder_gpu(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode,
-            const T* borderValue, cudaStream_t stream)
-        {
-            typedef typename TypeVec<T, cn>::vec_type pixel_t;
-
-            typedef void (*launcher_t)(const PtrStepSz<pixel_t>& src, const PtrStepSz<pixel_t>& dst, int top, int left, const T* borderValue, cudaStream_t stream);
-
-            static const launcher_t launchers[5] =
-            {
-                CopyMakeBorderDispatcher<BrdReflect101, pixel_t>::call,
-                CopyMakeBorderDispatcher<BrdReplicate, pixel_t>::call,
-                CopyMakeBorderDispatcher<BrdConstant, pixel_t>::call,
-                CopyMakeBorderDispatcher<BrdReflect, pixel_t>::call,
-                CopyMakeBorderDispatcher<BrdWrap, pixel_t>::call
-            };
-
-            const launcher_t launch = launchers[borderMode];
-
-            launch(PtrStepSz<pixel_t>(src), PtrStepSz<pixel_t>(dst), top, left, borderValue, stream);
-        }
-
-        // Explicit instantiations of copyMakeBorder_gpu<T, cn>, grouped by element type.
-        // Lines that are commented out are type / channel-count combinations that are not instantiated.
-
-        // uchar
-        template void copyMakeBorder_gpu<uchar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<uchar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<uchar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<uchar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
-
-        // schar (disabled)
-        //template void copyMakeBorder_gpu<schar, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<schar, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<schar, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<schar, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
-
-        // ushort
-        template void copyMakeBorder_gpu<ushort, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<ushort, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<ushort, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<ushort, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
-
-        // short
-        template void copyMakeBorder_gpu<short, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<short, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<short, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<short, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
-
-        // int (disabled)
-        //template void copyMakeBorder_gpu<int, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<int, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<int, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<int, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
-
-        // float
-        template void copyMakeBorder_gpu<float, 1>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
-        //template void copyMakeBorder_gpu<float, 2>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<float, 3>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
-        template void copyMakeBorder_gpu<float, 4>(const PtrStepSzb& src, const PtrStepSzb& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
-    } // namespace imgproc
-}}} // namespace cv { namespace gpu { namespace cudev
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/debayer.cu
+++ b/modules/gpu/src/cuda/debayer.cu
@@ -1,544 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-#include "opencv2/core/cuda/color.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    template <typename T> struct Bayer2BGR; // primary template is only declared; the uchar and ushort specializations in this file carry the implementation
-
-    template <> struct Bayer2BGR<uchar> // demosaics one uchar4 (four horizontally adjacent 8-bit Bayer samples) into four 3-channel pixels
-    {
-        uchar3 res0; // result for the 1st sample of the centre uchar4 (patch[1][1].x)
-        uchar3 res1; // result for the 2nd sample (patch[1][1].y)
-        uchar3 res2; // result for the 3rd sample (patch[1][1].z)
-        uchar3 res3; // result for the 4th sample (patch[1][1].w)
-        // Fills res0..res3 for uchar4 group s_x of row s_y. Rows s_y-1 and s_y+1 are read unconditionally, so the caller must keep s_y inside [1, rows-2].
-        __device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
-        {
-            uchar4 patch[3][3]; // patch[r][c]: r = rows s_y-1..s_y+1, c = uchar4 groups s_x-1..s_x+1 (left/right groups clamped to the row)
-            patch[0][1] = ((const uchar4*) src.ptr(s_y - 1))[s_x];
-            patch[0][0] = ((const uchar4*) src.ptr(s_y - 1))[::max(s_x - 1, 0)]; // left neighbour clamped at group 0
-            patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)]; // right neighbour clamped at the last uchar4 group, ceil(cols / 4) - 1
-
-            patch[1][1] = ((const uchar4*) src.ptr(s_y))[s_x];
-            patch[1][0] = ((const uchar4*) src.ptr(s_y))[::max(s_x - 1, 0)];
-            patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
-
-            patch[2][1] = ((const uchar4*) src.ptr(s_y + 1))[s_x];
-            patch[2][0] = ((const uchar4*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
-            patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
-            // Row parity XOR start_with_green selects which of the two row layouts is used: in the first branch samples .x/.z are copied to the .y channel, in the second samples .y/.w are.
-            if ((s_y & 1) ^ start_with_green)
-            { // t0..t7: (a + b + 1) >> 1 and (a + b + c + d + 2) >> 2 are rounded integer averages of 2 and 4 neighbouring samples
-                const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1;
-                const int t1 = (patch[1][0].w + patch[1][1].y + 1) >> 1;
-
-                const int t2 = (patch[0][1].x + patch[0][1].z + patch[2][1].x + patch[2][1].z + 2) >> 2;
-                const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][1].z + patch[2][1].y + 2) >> 2;
-
-                const int t4 = (patch[0][1].z + patch[2][1].z + 1) >> 1;
-                const int t5 = (patch[1][1].y + patch[1][1].w + 1) >> 1;
-
-                const int t6 = (patch[0][1].z + patch[0][2].x + patch[2][1].z + patch[2][2].x + 2) >> 2;
-                const int t7 = (patch[0][1].w + patch[1][1].z + patch[1][2].x + patch[2][1].w + 2) >> 2;
-                // blue_last (XOR row parity) only swaps which values land in .x and .z; the .y assignments are the same in both branches.
-                if ((s_y & 1) ^ blue_last)
-                {
-                    res0.x = t1;
-                    res0.y = patch[1][1].x;
-                    res0.z = t0;
-
-                    res1.x = patch[1][1].y;
-                    res1.y = t3;
-                    res1.z = t2;
-
-                    res2.x = t5;
-                    res2.y = patch[1][1].z;
-                    res2.z = t4;
-
-                    res3.x = patch[1][1].w;
-                    res3.y = t7;
-                    res3.z = t6;
-                }
-                else
-                {
-                    res0.x = t0;
-                    res0.y = patch[1][1].x;
-                    res0.z = t1;
-
-                    res1.x = t2;
-                    res1.y = t3;
-                    res1.z = patch[1][1].y;
-
-                    res2.x = t4;
-                    res2.y = patch[1][1].z;
-                    res2.z = t5;
-
-                    res3.x = t6;
-                    res3.y = t7;
-                    res3.z = patch[1][1].w;
-                }
-            }
-            else
-            { // other row layout: samples .y/.w of the centre group are copied to the .y channel, samples .x/.z go to .x or .z
-                const int t0 = (patch[0][0].w + patch[0][1].y + patch[2][0].w + patch[2][1].y + 2) >> 2;
-                const int t1 = (patch[0][1].x + patch[1][0].w + patch[1][1].y + patch[2][1].x + 2) >> 2;
-
-                const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
-                const int t3 = (patch[1][1].x + patch[1][1].z + 1) >> 1;
-
-                const int t4 = (patch[0][1].y + patch[0][1].w + patch[2][1].y + patch[2][1].w + 2) >> 2;
-                const int t5 = (patch[0][1].z + patch[1][1].y + patch[1][1].w + patch[2][1].z + 2) >> 2;
-
-                const int t6 = (patch[0][1].w + patch[2][1].w + 1) >> 1;
-                const int t7 = (patch[1][1].z + patch[1][2].x + 1) >> 1;
-                // As above, blue_last only decides whether the copied sample and the diagonal average go to .x/.z or .z/.x.
-                if ((s_y & 1) ^ blue_last)
-                {
-                    res0.x = patch[1][1].x;
-                    res0.y = t1;
-                    res0.z = t0;
-
-                    res1.x = t3;
-                    res1.y = patch[1][1].y;
-                    res1.z = t2;
-
-                    res2.x = patch[1][1].z;
-                    res2.y = t5;
-                    res2.z = t4;
-
-                    res3.x = t7;
-                    res3.y = patch[1][1].w;
-                    res3.z = t6;
-                }
-                else
-                {
-                    res0.x = t0;
-                    res0.y = t1;
-                    res0.z = patch[1][1].x;
-
-                    res1.x = t2;
-                    res1.y = patch[1][1].y;
-                    res1.z = t3;
-
-                    res2.x = t4;
-                    res2.y = t5;
-                    res2.z = patch[1][1].z;
-
-                    res3.x = t6;
-                    res3.y = patch[1][1].w;
-                    res3.z = t7;
-                }
-            }
-        }
-    };
-
-    template <typename D> __device__ __forceinline__ D toDst(const uchar3& pix); // converts one demosaiced 8-bit 3-channel pixel to output type D; only the specializations below are defined
-    template <> __device__ __forceinline__ uchar toDst<uchar>(const uchar3& pix) // single-channel output: runs the functor supplied by bgr_to_gray_traits<uchar>
-    {
-        typename bgr_to_gray_traits<uchar>::functor_type f = bgr_to_gray_traits<uchar>::create_functor();
-        return f(pix);
-    }
-    template <> __device__ __forceinline__ uchar3 toDst<uchar3>(const uchar3& pix) // 3-channel output: the pixel is returned unchanged
-    {
-        return pix;
-    }
-    template <> __device__ __forceinline__ uchar4 toDst<uchar4>(const uchar3& pix) // 4-channel output: the three channels are kept and the 4th is set to 255
-    {
-        return make_uchar4(pix.x, pix.y, pix.z, 255);
-    }
-
-    template <typename D>
-    __global__ void Bayer2BGR_8u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green) // each thread converts up to four consecutive 8-bit source pixels of one row; D is the output pixel type produced by toDst<D>
-    {
-        const int s_x = blockIdx.x * blockDim.x + threadIdx.x; // index of the uchar4 group (4 source pixels) inside the row
-        int s_y = blockIdx.y * blockDim.y + threadIdx.y;
-        // Threads whose row or first pixel lies outside the source do nothing.
-        if (s_y >= src.rows || (s_x << 2) >= src.cols)
-            return;
-        // Clamp the row used for interpolation so apply() can read s_y-1 and s_y+1; the first and last rows are therefore computed from their interior neighbour row. NOTE(review): assumes src.rows >= 3 - confirm against callers.
-        s_y = ::min(::max(s_y, 1), src.rows - 2);
-
-        Bayer2BGR<uchar> bayer;
-        bayer.apply(src, s_x, s_y, blue_last, start_with_green);
-
-        const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2; // first destination column handled by this thread
-        const int d_y = blockIdx.y * blockDim.y + threadIdx.y; // destination row is the unclamped thread row
-        // The first pixel is always inside the image (checked above); the remaining three are guarded for widths that are not a multiple of 4.
-        dst(d_y, d_x) = toDst<D>(bayer.res0);
-        if (d_x + 1 < src.cols)
-            dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
-        if (d_x + 2 < src.cols)
-            dst(d_y, d_x + 2) = toDst<D>(bayer.res2);
-        if (d_x + 3 < src.cols)
-            dst(d_y, d_x + 3) = toDst<D>(bayer.res3);
-    }
-
-    template <> struct Bayer2BGR<ushort> // 16-bit counterpart of the uchar specialization: demosaics one ushort2 (two adjacent samples) into two 3-channel pixels
-    {
-        ushort3 res0; // result for the 1st sample of the centre ushort2 (patch[1][1].x)
-        ushort3 res1; // result for the 2nd sample (patch[1][1].y)
-        // Fills res0/res1 for ushort2 group s_x of row s_y. Rows s_y-1 and s_y+1 are read unconditionally, so the caller must keep s_y inside [1, rows-2].
-        __device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
-        {
-            ushort2 patch[3][3]; // patch[r][c]: r = rows s_y-1..s_y+1, c = ushort2 groups s_x-1..s_x+1 (left/right groups clamped to the row)
-            patch[0][1] = ((const ushort2*) src.ptr(s_y - 1))[s_x];
-            patch[0][0] = ((const ushort2*) src.ptr(s_y - 1))[::max(s_x - 1, 0)]; // left neighbour clamped at group 0
-            patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)]; // right neighbour clamped at the last ushort2 group, ceil(cols / 2) - 1
-
-            patch[1][1] = ((const ushort2*) src.ptr(s_y))[s_x];
-            patch[1][0] = ((const ushort2*) src.ptr(s_y))[::max(s_x - 1, 0)];
-            patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
-
-            patch[2][1] = ((const ushort2*) src.ptr(s_y + 1))[s_x];
-            patch[2][0] = ((const ushort2*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
-            patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
-            // Row parity XOR start_with_green selects the row layout: in the first branch sample .x is copied to the .y channel, in the second sample .y is.
-            if ((s_y & 1) ^ start_with_green)
-            { // t0..t3: (a + b + 1) >> 1 and (a + b + c + d + 2) >> 2 are rounded integer averages of 2 and 4 neighbouring samples
-                const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1;
-                const int t1 = (patch[1][0].y + patch[1][1].y + 1) >> 1;
-
-                const int t2 = (patch[0][1].x + patch[0][2].x + patch[2][1].x + patch[2][2].x + 2) >> 2;
-                const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][2].x + patch[2][1].y + 2) >> 2;
-                // blue_last (XOR row parity) only swaps which values land in .x and .z; the .y assignments are the same in both branches.
-                if ((s_y & 1) ^ blue_last)
-                {
-                    res0.x = t1;
-                    res0.y = patch[1][1].x;
-                    res0.z = t0;
-
-                    res1.x = patch[1][1].y;
-                    res1.y = t3;
-                    res1.z = t2;
-                }
-                else
-                {
-                    res0.x = t0;
-                    res0.y = patch[1][1].x;
-                    res0.z = t1;
-
-                    res1.x = t2;
-                    res1.y = t3;
-                    res1.z = patch[1][1].y;
-                }
-            }
-            else
-            { // other row layout: sample .y of the centre group is copied to the .y channel, sample .x goes to .x or .z
-                const int t0 = (patch[0][0].y + patch[0][1].y + patch[2][0].y + patch[2][1].y + 2) >> 2;
-                const int t1 = (patch[0][1].x + patch[1][0].y + patch[1][1].y + patch[2][1].x + 2) >> 2;
-
-                const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
-                const int t3 = (patch[1][1].x + patch[1][2].x + 1) >> 1;
-                // As above, blue_last only decides whether values go to .x/.z or .z/.x.
-                if ((s_y & 1) ^ blue_last)
-                {
-                    res0.x = patch[1][1].x;
-                    res0.y = t1;
-                    res0.z = t0;
-
-                    res1.x = t3;
-                    res1.y = patch[1][1].y;
-                    res1.z = t2;
-                }
-                else
-                {
-                    res0.x = t0;
-                    res0.y = t1;
-                    res0.z = patch[1][1].x;
-
-                    res1.x = t2;
-                    res1.y = patch[1][1].y;
-                    res1.z = t3;
-                }
-            }
-        }
-    };
-
-    template <typename D> __device__ __forceinline__ D toDst(const ushort3& pix); // converts one demosaiced 16-bit 3-channel pixel to output type D; only the specializations below are defined
-    template <> __device__ __forceinline__ ushort toDst<ushort>(const ushort3& pix) // single-channel output: runs the functor supplied by bgr_to_gray_traits<ushort>
-    {
-        typename bgr_to_gray_traits<ushort>::functor_type f = bgr_to_gray_traits<ushort>::create_functor();
-        return f(pix);
-    }
-    template <> __device__ __forceinline__ ushort3 toDst<ushort3>(const ushort3& pix) // 3-channel output: the pixel is returned unchanged
-    {
-        return pix;
-    }
-    template <> __device__ __forceinline__ ushort4 toDst<ushort4>(const ushort3& pix) // 4-channel output: the 4th channel is set to the largest ushort value
-    {
-        return make_ushort4(pix.x, pix.y, pix.z, numeric_limits<ushort>::max());
-    }
-
-    template <typename D>
-    __global__ void Bayer2BGR_16u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green) // each thread converts up to two consecutive 16-bit source pixels of one row; D is the output pixel type produced by toDst<D>
-    {
-        const int s_x = blockIdx.x * blockDim.x + threadIdx.x; // index of the ushort2 group (2 source pixels) inside the row
-        int s_y = blockIdx.y * blockDim.y + threadIdx.y;
-        // Threads whose row or first pixel lies outside the source do nothing.
-        if (s_y >= src.rows || (s_x << 1) >= src.cols)
-            return;
-        // Clamp the row used for interpolation so apply() can read s_y-1 and s_y+1; the first and last rows are computed from their interior neighbour row. NOTE(review): assumes src.rows >= 3 - confirm against callers.
-        s_y = ::min(::max(s_y, 1), src.rows - 2);
-
-        Bayer2BGR<ushort> bayer;
-        bayer.apply(src, s_x, s_y, blue_last, start_with_green);
-
-        const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1; // first destination column handled by this thread
-        const int d_y = blockIdx.y * blockDim.y + threadIdx.y; // destination row is the unclamped thread row
-        // The second pixel is guarded for odd widths.
-        dst(d_y, d_x) = toDst<D>(bayer.res0);
-        if (d_x + 1 < src.cols)
-            dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
-    }
-
-    template <int cn>
-    void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream) // host launcher for the 8-bit kernel; cn is the number of output channels
-    {
-        typedef typename TypeVec<uchar, cn>::vec_type dst_t; // output pixel type with cn uchar channels
-
-        const dim3 block(32, 8);
-        const dim3 grid(divUp(src.cols, 4 * block.x), divUp(src.rows, block.y)); // every thread covers 4 columns, hence the factor 4 in x
-
-        cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<dst_t>, cudaFuncCachePreferL1) );
-
-        Bayer2BGR_8u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
-        cudaSafeCall( cudaGetLastError() );
-        // Only the default stream (0) is synchronized here; with any other stream the call returns without waiting.
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    template <int cn>
-    void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream) // host launcher for the 16-bit kernel; cn is the number of output channels
-    {
-        typedef typename TypeVec<ushort, cn>::vec_type dst_t; // output pixel type with cn ushort channels
-
-        const dim3 block(32, 8);
-        const dim3 grid(divUp(src.cols, 2 * block.x), divUp(src.rows, block.y)); // every thread covers 2 columns, hence the factor 2 in x
-
-        cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<dst_t>, cudaFuncCachePreferL1) );
-
-        Bayer2BGR_16u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
-        cudaSafeCall( cudaGetLastError() );
-        // Only the default stream (0) is synchronized here; with any other stream the call returns without waiting.
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    template void Bayer2BGR_8u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); // explicit instantiations: 8-bit output with 1, 3 and 4 channels
-    template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-    template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-
-    template void Bayer2BGR_16u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream); // explicit instantiations: 16-bit output with 1, 3 and 4 channels
-    template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-    template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
-
-    //////////////////////////////////////////////////////////////
-    // Bayer Demosaicing (Malvar, He, and Cutler)
-    //
-    // by Morgan McGuire, Williams College
-    // http://graphics.cs.williams.edu/papers/BayerJGT09/#shaders
-    //
-    // ported to CUDA
-
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp); // file-scope 2D texture of raw uchar samples read by the MHCdemosaic kernel; point filtering, clamp addressing; bound by the MHCdemosaic host function
-
-    template <typename DstType>
-    __global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed) // one thread per destination pixel; samples are fetched from sourceTex at (x, y) + sourceOffset; firstRed shifts the parity test that classifies the pixel
-    { // filter coefficients, all in eighths; the entries left inside /* */ are not used by the code below
-        const float   kAx = -1.0f / 8.0f,     kAy = -1.5f / 8.0f,     kAz =  0.5f / 8.0f    /*kAw = -1.0f / 8.0f*/;
-        const float   kBx =  2.0f / 8.0f,   /*kBy =  0.0f / 8.0f,*/ /*kBz =  0.0f / 8.0f,*/   kBw =  4.0f / 8.0f  ;
-        const float   kCx =  4.0f / 8.0f,     kCy =  6.0f / 8.0f,     kCz =  5.0f / 8.0f    /*kCw =  5.0f / 8.0f*/;
-        const float /*kDx =  0.0f / 8.0f,*/   kDy =  2.0f / 8.0f,     kDz = -1.0f / 8.0f    /*kDw = -1.0f / 8.0f*/;
-        const float   kEx = -1.0f / 8.0f,     kEy = -1.5f / 8.0f,   /*kEz = -1.0f / 8.0f,*/   kEw =  0.5f / 8.0f  ;
-        const float   kFx =  2.0f / 8.0f,   /*kFy =  0.0f / 8.0f,*/   kFz =  4.0f / 8.0f    /*kFw =  0.0f / 8.0f*/;
-
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-        // Threads on the outermost one-pixel frame of dst, or outside dst, return without writing anything.
-        if (x == 0 || x >= dst.cols - 1 || y == 0 || y >= dst.rows - 1)
-            return;
-
-        int2 center; // texture coordinates of the sample that corresponds to destination pixel (x, y)
-        center.x = x + sourceOffset.x;
-        center.y = y + sourceOffset.y;
-
-        int4 xCoord; // columns at offsets -2, -1, +1, +2 from the centre
-        xCoord.x = center.x - 2;
-        xCoord.y = center.x - 1;
-        xCoord.z = center.x + 1;
-        xCoord.w = center.x + 2;
-
-        int4 yCoord; // rows at offsets -2, -1, +1, +2 from the centre
-        yCoord.x = center.y - 2;
-        yCoord.y = center.y - 1;
-        yCoord.z = center.y + 1;
-        yCoord.w = center.y + 2;
-
-        float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)
-
-        float4 Dvec; // the four diagonal neighbours
-        Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
-        Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
-        Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
-        Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)
-
-        float4 value; // sums of opposite vertical / horizontal neighbour pairs; aliased as A, B, E, F by the macros below
-        value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
-        value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
-        value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
-        value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0
-
-        // (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
-        value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
-        value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
-        value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
-        value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1
-
-        float4 PATTERN; // accumulates the four filter responses (x, y, z, w as listed in the "Channel Matches" comment below), starting with the centre term
-        PATTERN.x = kCx * C;
-        PATTERN.y = kCy * C;
-        PATTERN.z = kCz * C;
-        PATTERN.w = PATTERN.z;
-
-        float D = Dvec.x + Dvec.y + Dvec.z + Dvec.w; // sum of the four diagonal neighbours
-
-        // There are five filter patterns (identity, cross, checker,
-        // theta, phi). Precompute the terms from all of them and then
-        // use swizzles to assign to color channels.
-        //
-        // Channel Matches
-        // x cross (e.g., EE G)
-        // y checker (e.g., EE B)
-        // z theta (e.g., EO R)
-        // w phi (e.g., EO B)
-
-        #define A value.x  // A0 + A1
-        #define B value.y  // B0 + B1
-        #define E value.z  // E0 + E1
-        #define F value.w  // F0 + F1
-        // NOTE(review): A, B, E and F are object-like macros that are not #undef'd anywhere in this file, so they remain defined for the rest of the translation unit.
-        float3 temp; // scratch for the products that the swizzle comments below refer to
-
-        // PATTERN.yzw += (kD.yz * D).xyy;
-        temp.x = kDy * D;
-        temp.y = kDz * D;
-        PATTERN.y += temp.x;
-        PATTERN.z += temp.y;
-        PATTERN.w += temp.y;
-
-        // PATTERN += (kA.xyz * A).xyzx;
-        temp.x = kAx * A;
-        temp.y = kAy * A;
-        temp.z = kAz * A;
-        PATTERN.x += temp.x;
-        PATTERN.y += temp.y;
-        PATTERN.z += temp.z;
-        PATTERN.w += temp.x;
-
-        // PATTERN += (kE.xyw * E).xyxz;
-        temp.x = kEx * E;
-        temp.y = kEy * E;
-        temp.z = kEw * E;
-        PATTERN.x += temp.x;
-        PATTERN.y += temp.y;
-        PATTERN.z += temp.x;
-        PATTERN.w += temp.z;
-
-        // PATTERN.xw += kB.xw * B;
-        PATTERN.x += kBx * B;
-        PATTERN.w += kBw * B;
-
-        // PATTERN.xz += kF.xz * F;
-        PATTERN.x += kFx * F;
-        PATTERN.z += kFz * F;
-
-        // Determine which of four types of pixels we are on.
-        int2 alternate; // parity of the destination coordinate after adding firstRed, per axis
-        alternate.x = (x + firstRed.x) % 2;
-        alternate.y = (y + firstRed.y) % 2;
-
-        // in BGR sequence;
-        uchar3 pixelColor = // the raw sample C fills one channel, the other two come from PATTERN; every value is saturated to uchar
-            (alternate.y == 0) ?
-                ((alternate.x == 0) ?
-                    make_uchar3(saturate_cast<uchar>(PATTERN.y), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(C)) :
-                    make_uchar3(saturate_cast<uchar>(PATTERN.w), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.z))) :
-                ((alternate.x == 0) ?
-                    make_uchar3(saturate_cast<uchar>(PATTERN.z), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.w)) :
-                    make_uchar3(saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(PATTERN.y)));
-
-        dst(y, x) = toDst<DstType>(pixelColor);
-    }
-
-    template <int cn>
-    void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream) // host launcher: binds src to sourceTex and runs the MHCdemosaic kernel; cn is the number of output channels
-    {
-        typedef typename TypeVec<uchar, cn>::vec_type dst_t; // output pixel type with cn uchar channels
-
-        const dim3 block(32, 8);
-        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y)); // NOTE(review): the grid is sized from src while the kernel bounds-checks against dst; a dst larger than src would not be fully covered - confirm callers
-
-        bindTexture(&sourceTex, src); // the kernel reads its input only through this file-scope texture
-
-        MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
-        cudaSafeCall( cudaGetLastError() );
-        // Only the default stream (0) is synchronized here; with any other stream the call returns without waiting.
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    template void MHCdemosaic<1>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream); // explicit instantiations: 1-, 3- and 4-channel 8-bit output
-    template void MHCdemosaic<3>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
-    template void MHCdemosaic<4>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
-}}}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/disp_bilateral_filter.cu
+++ b/modules/gpu/src/cuda/disp_bilateral_filter.cu
@@ -1,223 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace disp_bilateral_filter
-    {
-        __constant__ float* ctable_color; // pointer to the colour-weight table, indexed in the kernel by the colour distance returned by DistRgbMax
-        __constant__ float* ctable_space; // pointer to the spatial-weight table, indexed in the kernel by (|dy|, |dx|)
-        __constant__ size_t ctable_space_step; // row stride of ctable_space in float elements (disp_load_constants divides the byte step by sizeof(float))
-
-        __constant__ int cndisp; // set from ndisp by disp_load_constants; not read by the kernel in this file
-        __constant__ int cradius; // half-size of the filtering window used by the kernel
-
-        __constant__ short cedge_disc; // a pixel is filtered only if some 4-neighbour disparity differs from it by at least this much
-        __constant__ short cmax_disc; // upper bound applied to each absolute disparity difference in the cost
-
-        void disp_load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc) // uploads the filter parameters into the __constant__ symbols declared above
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) ); // copies the pointer value itself, not the table contents
-            cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) ); // likewise only the data pointer is copied
-            size_t table_space_step = table_space.step / sizeof(float); // convert the byte stride into a stride in float elements
-            cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );
-
-            cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );
-
-            cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );
-        }
-
-        template <int channels>
-        struct DistRgbMax // colour distance between two pixels: the largest absolute per-channel difference over the first three channels
-        {
-            static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) // a and b each point at the first channel of a pixel; elements [0..2] are read regardless of the channels parameter
-            {
-                uchar x = ::abs(a[0] - b[0]);
-                uchar y = ::abs(a[1] - b[1]);
-                uchar z = ::abs(a[2] - b[2]);
-                return (::max(::max(x, y), z));
-            }
-        };
-
-        template <>
-        struct DistRgbMax<1> // single-channel specialization: the distance is the absolute difference of the one channel
-        {
-            static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b) // only a[0] and b[0] are read
-            {
-                return ::abs(a[0] - b[0]);
-            }
-        };
-
-        template <int channels, typename T>
-        __global__ void disp_bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w) // in-place pass over disparity map disp (h x w, disp_step in elements of T) guided by image img (img_step in bytes, channels bytes per pixel)
-        {
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-            const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1); // each thread owns a pair of columns and takes the one whose parity matches (y + t), so t = 0 and t = 1 visit complementary checkerboard cells
-
-            T dp[5]; // candidate disparities: the pixel itself followed by its upper, left, lower and right neighbours
-            // Only interior pixels are processed, so all four neighbours exist.
-            if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
-            {
-                dp[0] = *(disp + (y  ) * disp_step + x + 0);
-                dp[1] = *(disp + (y-1) * disp_step + x + 0);
-                dp[2] = *(disp + (y  ) * disp_step + x - 1);
-                dp[3] = *(disp + (y+1) * disp_step + x + 0);
-                dp[4] = *(disp + (y  ) * disp_step + x + 1);
-                // Pixels whose four neighbours all differ by less than cedge_disc are left untouched.
-                if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc)
-                {
-                    const int ymin = ::max(0, y - cradius); // window of radius cradius around (x, y), clipped to the image
-                    const int xmin = ::max(0, x - cradius);
-                    const int ymax = ::min(h - 1, y + cradius);
-                    const int xmax = ::min(w - 1, x + cradius);
-
-                    float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; // accumulated weighted cost of each candidate in dp
-
-                    const uchar* ic = img + y * img_step + channels * x; // guide-image pixel at the window centre
-
-                    for(int yi = ymin; yi <= ymax; yi++)
-                    {
-                        const T* disp_y = disp + yi * disp_step;
-
-                        for(int xi = xmin; xi <= xmax; xi++)
-                        {
-                            const uchar* in = img + yi * img_step + channels * xi;
-
-                            uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);
-                            // weight = colour-table entry for the colour distance times spatial-table entry for the offset (|y - yi|, |x - xi|)
-                            const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];
-
-                            const T disp_reg = disp_y[xi];
-                            // Each candidate is charged the absolute disparity difference to this window pixel, capped at cmax_disc, times the weight.
-                            cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight;
-                            cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;
-                            cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;
-                            cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;
-                            cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;
-                        }
-                    }
-                    // Pick the candidate with the smallest cost; the strict '<' keeps the lowest index on ties.
-                    float minimum = numeric_limits<float>::max();
-                    int id = 0;
-
-                    if (cost[0] < minimum)
-                    {
-                        minimum = cost[0];
-                        id = 0;
-                    }
-                    if (cost[1] < minimum)
-                    {
-                        minimum = cost[1];
-                        id = 1;
-                    }
-                    if (cost[2] < minimum)
-                    {
-                        minimum = cost[2];
-                        id = 2;
-                    }
-                    if (cost[3] < minimum)
-                    {
-                        minimum = cost[3];
-                        id = 3;
-                    }
-                    if (cost[4] < minimum)
-                    {
-                        minimum = cost[4];
-                        id = 4;
-                    }
-                    // The winning candidate overwrites the pixel in the same buffer that is being read.
-                    *(disp + y * disp_step + x) = dp[id];
-                }
-            }
-        }
-
-        template <typename T>
-        void disp_bilateral_filter(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream) // host launcher: runs iters iterations of the kernel on disp in place, guided by img; channels must be 1 or 3
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-            grid.x = divUp(disp.cols, threads.x << 1); // every thread covers a pair of columns, hence twice the block width per block
-            grid.y = divUp(disp.rows, threads.y);
-            // channels is a runtime value but the kernel takes it as a template argument, so dispatch explicitly.
-            switch (channels)
-            {
-            case 1:
-                for (int i = 0; i < iters; ++i)
-                { // each iteration launches the kernel twice, with t = 0 and then t = 1, the two column parities selected inside the kernel
-                    disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-                    cudaSafeCall( cudaGetLastError() );
-
-                    disp_bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-                    cudaSafeCall( cudaGetLastError() );
-                }
-                break;
-            case 3:
-                for (int i = 0; i < iters; ++i)
-                { // same two-launch scheme as the single-channel case
-                    disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-                    cudaSafeCall( cudaGetLastError() );
-
-                    disp_bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
-                    cudaSafeCall( cudaGetLastError() );
-                }
-                break;
-            default:
-                CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
-            }
-            // Only the default stream (0) is synchronized here; with any other stream the call returns without waiting.
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void disp_bilateral_filter<uchar>(PtrStepSz<uchar> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream); // explicit instantiations for uchar and short disparity maps
-        template void disp_bilateral_filter<short>(PtrStepSz<short> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
-    } // namespace disp_bilateral_filter
-}}} // namespace cv { namespace gpu { namespace cudev
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/element_operations.cu
+++ b/modules/gpu/src/cuda/element_operations.cu
--- a/modules/gpu/src/cuda/fast.cu
+++ b/modules/gpu/src/cuda/fast.cu
--- a/modules/gpu/src/cuda/fgd_bgfg.cu
+++ b/modules/gpu/src/cuda/fgd_bgfg.cu
@@ -1,801 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-#include "opencv2/core/cuda/utility.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "fgd_bgfg_common.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::cudev;
-
-namespace bgfg
-{
-    ////////////////////////////////////////////////////////////////////////////
-    // calcDiffHistogram
-
-    const unsigned int UINT_BITS = 32U;   // bit width assumed for unsigned int in the tag arithmetic below
-    const int LOG_WARP_SIZE = 5;          // log2 of WARP_SIZE
-    const int WARP_SIZE = 1 << LOG_WARP_SIZE;
-#if (__CUDA_ARCH__ < 120)
-    // Low (UINT_BITS - LOG_WARP_SIZE) bits of a histogram cell hold the count;
-    // the remaining high bits hold the thread tag written by addByte.
-    const unsigned int TAG_MASK = (1U << (UINT_BITS - LOG_WARP_SIZE)) - 1U;
-#endif
-
-    // Thread-block size used to launch mergeHistogram and to size its shared arrays.
-    const int MERGE_THREADBLOCK_SIZE = 256;
-
-    // Adds one vote to bin `data` of the per-warp histogram `s_WarpHist_`.
-    //
-    // For __CUDA_ARCH__ < 120 the increment is done without atomics: each thread
-    // writes (threadTag | count + 1) and retries until the value it reads back is
-    // the one it wrote. Otherwise a plain atomicInc on the bin is used.
-    __device__ __forceinline__ void addByte(unsigned int* s_WarpHist_, unsigned int data, unsigned int threadTag)
-    {
-        #if (__CUDA_ARCH__ < 120)
-            volatile unsigned int* const bins = s_WarpHist_;
-            unsigned int tagged;
-            do
-            {
-                // Strip the previous writer's tag, bump the count, stamp our own tag.
-                tagged = bins[data] & TAG_MASK;
-                tagged = threadTag | (tagged + 1);
-                bins[data] = tagged;
-            } while (bins[data] != tagged);   // another thread overwrote us: try again
-        #else
-            atomicInc(&s_WarpHist_[data], (unsigned int)(-1));
-        #endif
-    }
-
-
-    // Kernel: each thread block accumulates, per channel, a histogram of the absolute
-    // difference between curFrame and prevFrame and stores it as one partial
-    // histogram in partialBuf0/1/2 (x, y and z channel respectively).
-    //
-    // Every warp of the block votes into its own shared-memory sub-histogram; the
-    // sub-histograms are summed per bin before being written out at
-    // [blockIdx.x * HISTOGRAM_BIN_COUNT + bin].
-    template <typename PT, typename CT>
-    __global__ void calcPartialHistogram(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2)
-    {
-#if (__CUDA_ARCH__ < 200)
-        const int warpsPerBlock = 4;
-#else
-        const int warpsPerBlock = 6;
-#endif
-        const int threadsPerBlock = warpsPerBlock * WARP_SIZE;
-        const int sharedBinCount = warpsPerBlock * HISTOGRAM_BIN_COUNT;
-
-        // One sub-histogram per warp, for each of the three channels.
-        __shared__ unsigned int s_Hist0[sharedBinCount];
-        __shared__ unsigned int s_Hist1[sharedBinCount];
-        __shared__ unsigned int s_Hist2[sharedBinCount];
-
-        // Zero the shared storage cooperatively before any votes are cast.
-        #pragma unroll
-        for (int chunk = 0; chunk < (sharedBinCount / threadsPerBlock); ++chunk)
-        {
-            const unsigned int slot = threadIdx.x + chunk * threadsPerBlock;
-
-            s_Hist0[slot] = 0;
-            s_Hist1[slot] = 0;
-            s_Hist2[slot] = 0;
-        }
-        __syncthreads();
-
-        // Locate this warp's private sub-histograms.
-        const unsigned int warpOffset = (threadIdx.x >> LOG_WARP_SIZE) * HISTOGRAM_BIN_COUNT;
-
-        unsigned int* const s_WarpHist0 = s_Hist0 + warpOffset;
-        unsigned int* const s_WarpHist1 = s_Hist1 + warpOffset;
-        unsigned int* const s_WarpHist2 = s_Hist2 + warpOffset;
-
-        const unsigned int tag = threadIdx.x << (UINT_BITS - LOG_WARP_SIZE);
-        const int dataCount = prevFrame.rows * prevFrame.cols;
-        const unsigned int firstPos = blockIdx.x * threadsPerBlock + threadIdx.x;
-
-        // Walk the frame as a flat array of rows * cols pixels.
-        for (unsigned int pos = firstPos; pos < dataCount; pos += threadsPerBlock * PARTIAL_HISTOGRAM_COUNT)
-        {
-            const unsigned int row = pos / prevFrame.cols;
-            const unsigned int col = pos % prevFrame.cols;
-
-            const PT before = prevFrame(row, col);
-            const CT after = curFrame(row, col);
-
-            const int dx = ::abs(after.x - before.x);
-            const int dy = ::abs(after.y - before.y);
-            const int dz = ::abs(after.z - before.z);
-
-            addByte(s_WarpHist0, dx, tag);
-            addByte(s_WarpHist1, dy, tag);
-            addByte(s_WarpHist2, dz, tag);
-        }
-        __syncthreads();
-
-        // Fold the per-warp sub-histograms into one per-block histogram in global memory.
-        for (unsigned int bin = threadIdx.x; bin < HISTOGRAM_BIN_COUNT; bin += threadsPerBlock)
-        {
-            unsigned int total0 = 0;
-            unsigned int total1 = 0;
-            unsigned int total2 = 0;
-
-            #pragma unroll
-            for (int w = 0; w < warpsPerBlock; ++w)
-            {
-                const unsigned int cell = bin + w * HISTOGRAM_BIN_COUNT;
-
-                #if (__CUDA_ARCH__ < 120)
-                    // Drop the thread tag left in the high bits by addByte.
-                    total0 += s_Hist0[cell] & TAG_MASK;
-                    total1 += s_Hist1[cell] & TAG_MASK;
-                    total2 += s_Hist2[cell] & TAG_MASK;
-                #else
-                    total0 += s_Hist0[cell];
-                    total1 += s_Hist1[cell];
-                    total2 += s_Hist2[cell];
-                #endif
-            }
-
-            const unsigned int dst = blockIdx.x * HISTOGRAM_BIN_COUNT + bin;
-
-            partialBuf0[dst] = total0;
-            partialBuf1[dst] = total1;
-            partialBuf2[dst] = total2;
-        }
-    }
-
-    // Kernel: block `blockIdx.x` sums bin `blockIdx.x` over all PARTIAL_HISTOGRAM_COUNT
-    // partial histograms and writes the totals to hist0/hist1/hist2.
-    __global__ void mergeHistogram(const unsigned int* partialBuf0, const unsigned int* partialBuf1, const unsigned int* partialBuf2, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2)
-    {
-        __shared__ unsigned int data0[MERGE_THREADBLOCK_SIZE];
-        __shared__ unsigned int data1[MERGE_THREADBLOCK_SIZE];
-        __shared__ unsigned int data2[MERGE_THREADBLOCK_SIZE];
-
-        unsigned int sum0 = 0;
-        unsigned int sum1 = 0;
-        unsigned int sum2 = 0;
-
-        // Each thread gathers its share of the partial histograms for this bin.
-        #pragma unroll
-        for (unsigned int part = threadIdx.x; part < PARTIAL_HISTOGRAM_COUNT; part += MERGE_THREADBLOCK_SIZE)
-        {
-            const unsigned int src = blockIdx.x + part * HISTOGRAM_BIN_COUNT;
-
-            sum0 += partialBuf0[src];
-            sum1 += partialBuf1[src];
-            sum2 += partialBuf2[src];
-        }
-
-        // Block-wide sum of the three per-thread accumulators.
-        plus<unsigned int> op;
-        reduce<MERGE_THREADBLOCK_SIZE>(smem_tuple(data0, data1, data2), thrust::tie(sum0, sum1, sum2), threadIdx.x, thrust::make_tuple(op, op, op));
-
-        if (threadIdx.x != 0)
-            return;
-
-        hist0[blockIdx.x] = sum0;
-        hist1[blockIdx.x] = sum1;
-        hist2[blockIdx.x] = sum2;
-    }
-
-    // Host entry point: launches calcPartialHistogram followed by mergeHistogram on
-    // `stream`, leaving the per-channel difference histograms in hist0/hist1/hist2.
-    // `cc20` selects the thread-block size (6 warps when true, 4 otherwise).
-    // When the default stream (0) is used, the call blocks until the device is idle.
-    template <typename PT, typename CT>
-    void calcDiffHistogram_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame,
-                               unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
-                               unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
-                               bool cc20, cudaStream_t stream)
-    {
-        const int warpsPerBlock = cc20 ? 6 : 4;
-        const int threadsPerBlock = warpsPerBlock * WARP_SIZE;
-
-        // Reinterpret the byte views as typed pixel views.
-        const PtrStepSz<PT> prevView(prevFrame);
-        const PtrStepSz<CT> curView(curFrame);
-
-        calcPartialHistogram<PT, CT><<<PARTIAL_HISTOGRAM_COUNT, threadsPerBlock, 0, stream>>>(
-                prevView, curView, partialBuf0, partialBuf1, partialBuf2);
-        cudaSafeCall( cudaGetLastError() );
-
-        mergeHistogram<<<HISTOGRAM_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(partialBuf0, partialBuf1, partialBuf2, hist0, hist1, hist2);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream != 0)
-            return;
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    // Explicit instantiations for every 3-/4-channel combination of previous and current frame.
-    template void calcDiffHistogram_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
-    template void calcDiffHistogram_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
-    template void calcDiffHistogram_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
-    template void calcDiffHistogram_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
-
-    /////////////////////////////////////////////////////////////////////////
-    // calcDiffThreshMask
-
-    // Kernel: marks pixels whose frame-to-frame change exceeds a per-channel threshold.
-    //
-    // prevFrame  - previous frame (its rows/cols define the processed area)
-    // curFrame   - current frame
-    // bestThres  - per-channel thresholds (x, y, z)
-    // changeMask - set to 255 where any channel of |cur - prev| is above its threshold;
-    //              other pixels are left untouched
-    //
-    // One thread handles one pixel.
-    template <typename PT, typename CT>
-    __global__ void calcDiffThreshMask(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame, uchar3 bestThres, PtrStepb changeMask)
-    {
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-
-        // Valid indices are [0, rows) x [0, cols). The launch grid is rounded up, so
-        // threads landing on y == rows or x == cols must bail out too (the previous
-        // '>' comparison let them through and they accessed memory out of bounds).
-        if (y >= prevFrame.rows || x >= prevFrame.cols)
-            return;
-
-        PT prevVal = prevFrame(y, x);
-        CT curVal = curFrame(y, x);
-
-        int3 diff = make_int3(
-            ::abs(curVal.x - prevVal.x),
-            ::abs(curVal.y - prevVal.y),
-            ::abs(curVal.z - prevVal.z)
-        );
-
-        if (diff.x > bestThres.x || diff.y > bestThres.y || diff.z > bestThres.z)
-            changeMask(y, x) = 255;
-    }
-
-    // Host entry point: runs calcDiffThreshMask over the whole frame with 32x8 thread
-    // blocks on `stream`. Blocks until the device is idle when the default stream (0)
-    // is used.
-    template <typename PT, typename CT>
-    void calcDiffThreshMask_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream)
-    {
-        const dim3 threads(32, 8);
-        const dim3 blocks(divUp(prevFrame.cols, threads.x), divUp(prevFrame.rows, threads.y));
-
-        // Reinterpret the byte views as typed pixel views.
-        const PtrStepSz<PT> prevView(prevFrame);
-        const PtrStepSz<CT> curView(curFrame);
-
-        calcDiffThreshMask<PT, CT><<<blocks, threads, 0, stream>>>(prevView, curView, bestThres, changeMask);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream != 0)
-            return;
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    // Explicit instantiations for every 3-/4-channel combination of previous and current frame.
-    template void calcDiffThreshMask_gpu<uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
-    template void calcDiffThreshMask_gpu<uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
-    template void calcDiffThreshMask_gpu<uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
-    template void calcDiffThreshMask_gpu<uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, uchar3 bestThres, PtrStepSzb changeMask, cudaStream_t stream);
-
-    /////////////////////////////////////////////////////////////////////////
-    // bgfgClassification
-
-    // Device-side copy of the per-pixel statistics descriptor read by the kernels below.
-    __constant__ BGPixelStat c_stat;
-
-    // Uploads `stat` into the constant-memory symbol c_stat.
-    void setBGPixelStat(const BGPixelStat& stat)
-    {
-        const BGPixelStat* const hostCopy = &stat;
-        cudaSafeCall( cudaMemcpyToSymbol(c_stat, hostCopy, sizeof(stat)) );
-    }
-
-    // Builds a pixel of type T from three channel values. Only the uchar3 and uchar4
-    // specializations exist; the uchar4 one sets the fourth channel to 255.
-    template <typename T> struct Output;
-
-    template <> struct Output<uchar3>
-    {
-        static __device__ __forceinline__ uchar3 make(uchar v0, uchar v1, uchar v2)
-        {
-            const uchar3 pixel = make_uchar3(v0, v1, v2);
-            return pixel;
-        }
-    };
-
-    template <> struct Output<uchar4>
-    {
-        static __device__ __forceinline__ uchar4 make(uchar v0, uchar v1, uchar v2)
-        {
-            const uchar4 pixel = make_uchar4(v0, v1, v2, 255);
-            return pixel;
-        }
-    };
-
-    // Kernel: classifies every changed pixel as foreground (1) or background (0).
-    //
-    // prevFrame, curFrame - previous and current frames
-    // Ftd                 - non-zero where the pixel is treated as a motion pixel
-    // Fbd                 - second change mask; a pixel is processed when Ftd or Fbd is set
-    // foreground          - output; written only for pixels flagged by Ftd or Fbd
-    // deltaC, deltaCC     - per-channel match tolerance for the Ct / CCt tables
-    // alpha2              - table entries with Pv <= alpha2 end the table scan
-    // N1c, N1cc           - maximum number of Ct / CCt table entries scanned
-    //
-    // Statistics are read from the constant-memory descriptor c_stat.
-    // One thread handles one pixel.
-    template <typename PT, typename CT, typename OT>
-    __global__ void bgfgClassification(const PtrStepSz<PT> prevFrame, const PtrStep<CT> curFrame,
-                                       const PtrStepb Ftd, const PtrStepb Fbd, PtrStepb foreground,
-                                       int deltaC, int deltaCC, float alpha2, int N1c, int N1cc)
-    {
-        const int i = blockIdx.y * blockDim.y + threadIdx.y;
-        const int j = blockIdx.x * blockDim.x + threadIdx.x;
-
-        // Valid indices are [0, rows) x [0, cols). The launch grid is rounded up, so
-        // threads landing on i == rows or j == cols must bail out too (the previous
-        // '>' comparison let them through and they accessed memory out of bounds).
-        if (i >= prevFrame.rows || j >= prevFrame.cols)
-            return;
-
-        if (Fbd(i, j) || Ftd(i, j))
-        {
-            float Pb  = 0.0f;
-            float Pv  = 0.0f;
-            float Pvb = 0.0f;
-
-            int val = 0;
-
-            // Is it a motion pixel?
-            if (Ftd(i, j))
-            {
-                // Without a trained dynamic model the pixel is foreground by default.
-                if (!c_stat.is_trained_dyn_model(i, j))
-                    val = 1;
-                else
-                {
-                    PT prevVal = prevFrame(i, j);
-                    CT curVal = curFrame(i, j);
-
-                    // Compare with stored CCt vectors:
-                    for (int k = 0; k < N1cc && c_stat.PV_CC(i, j, k) > alpha2; ++k)
-                    {
-                        OT v1 = c_stat.V1_CC<OT>(i, j, k);
-                        OT v2 = c_stat.V2_CC<OT>(i, j, k);
-
-                        if (::abs(v1.x - prevVal.x) <= deltaCC &&
-                            ::abs(v1.y - prevVal.y) <= deltaCC &&
-                            ::abs(v1.z - prevVal.z) <= deltaCC &&
-                            ::abs(v2.x - curVal.x) <= deltaCC &&
-                            ::abs(v2.y - curVal.y) <= deltaCC &&
-                            ::abs(v2.z - curVal.z) <= deltaCC)
-                        {
-                            Pv += c_stat.PV_CC(i, j, k);
-                            Pvb += c_stat.PVB_CC(i, j, k);
-                        }
-                    }
-
-                    Pb = c_stat.Pbcc(i, j);
-                    if (2 * Pvb * Pb <= Pv)
-                        val = 1;
-                }
-            }
-            else if(c_stat.is_trained_st_model(i, j))
-            {
-                CT curVal = curFrame(i, j);
-
-                // Compare with stored Ct vectors:
-                for (int k = 0; k < N1c && c_stat.PV_C(i, j, k) > alpha2; ++k)
-                {
-                    OT v = c_stat.V_C<OT>(i, j, k);
-
-                    if (::abs(v.x - curVal.x) <= deltaC &&
-                        ::abs(v.y - curVal.y) <= deltaC &&
-                        ::abs(v.z - curVal.z) <= deltaC)
-                    {
-                        Pv += c_stat.PV_C(i, j, k);
-                        Pvb += c_stat.PVB_C(i, j, k);
-                    }
-                }
-                Pb = c_stat.Pbc(i, j);
-                if (2 * Pvb * Pb <= Pv)
-                    val = 1;
-            }
-
-            // Update foreground:
-            foreground(i, j) = static_cast<uchar>(val);
-        } // end if( change detection...
-    }
-
-    // Host entry point: runs bgfgClassification over the whole frame with 32x8 thread
-    // blocks on `stream`, requesting the L1-preferring cache configuration for the
-    // kernel. Blocks until the device is idle when the default stream (0) is used.
-    template <typename PT, typename CT, typename OT>
-    void bgfgClassification_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground,
-                                int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream)
-    {
-        const dim3 threads(32, 8);
-        const dim3 blocks(divUp(prevFrame.cols, threads.x), divUp(prevFrame.rows, threads.y));
-
-        cudaSafeCall( cudaFuncSetCacheConfig(bgfgClassification<PT, CT, OT>, cudaFuncCachePreferL1) );
-
-        // Reinterpret the byte views as typed pixel views.
-        const PtrStepSz<PT> prevView(prevFrame);
-        const PtrStepSz<CT> curView(curFrame);
-
-        bgfgClassification<PT, CT, OT><<<blocks, threads, 0, stream>>>(prevView, curView,
-                                                                     Ftd, Fbd, foreground,
-                                                                     deltaC, deltaCC, alpha2, N1c, N1cc);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream != 0)
-            return;
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    // Explicit instantiations for every 3-/4-channel combination of previous frame, current frame and model pixel type.
-    template void bgfgClassification_gpu<uchar3, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-    template void bgfgClassification_gpu<uchar3, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-    template void bgfgClassification_gpu<uchar3, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-    template void bgfgClassification_gpu<uchar3, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-    template void bgfgClassification_gpu<uchar4, uchar3, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-    template void bgfgClassification_gpu<uchar4, uchar3, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-    template void bgfgClassification_gpu<uchar4, uchar4, uchar3>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-    template void bgfgClassification_gpu<uchar4, uchar4, uchar4>(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-
-    ////////////////////////////////////////////////////////////////////////////
-    // updateBackgroundModel
-
-    // Kernel: updates the per-pixel background statistics and the reference
-    // background image after classification. One thread handles one pixel.
-    //
-    // cols, rows          - frame size; valid pixels are [0, rows) x [0, cols)
-    // prevFrame, curFrame - previous and current frames
-    // Ftd, Fbd            - change masks (Ftd non-zero marks a motion pixel)
-    // foreground          - classification result; zero means background
-    // background          - reference background image, updated for background pixels
-    // deltaC, deltaCC     - per-channel match tolerance for the Ct / CCt tables
-    // alpha1              - blend factor of the IIR filter applied to `background`
-    // alpha2, alpha3      - learning rate when the model is trained / not yet trained
-    // N1c, N1cc           - number of leading Ct / CCt entries used for the "once-off" check
-    // N2c, N2cc           - total number of Ct / CCt table entries
-    // T                   - threshold for marking a model trained and for detecting
-    //                       "once-off" changes
-    //
-    // The kernel runs three steps per pixel:
-    //   1. CCt (dynamic) table update, when Ftd is set or the dynamic model is untrained;
-    //   2. Ct (stationary) table update, when Ftd is not set;
-    //   3. reference background update, when the pixel is not foreground.
-    // All statistics are accessed through the constant-memory descriptor c_stat.
-    template <typename PT, typename CT, typename OT, class PrevFramePtr2D, class CurFramePtr2D, class FtdPtr2D, class FbdPtr2D>
-    __global__ void updateBackgroundModel(int cols, int rows, const PrevFramePtr2D prevFrame, const CurFramePtr2D curFrame, const FtdPtr2D Ftd, const FbdPtr2D Fbd,
-                                          PtrStepb foreground, PtrStep<OT> background,
-                                          int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T)
-    {
-        const int i = blockIdx.y * blockDim.y + threadIdx.y;
-        const int j = blockIdx.x * blockDim.x + threadIdx.x;
-
-        // Valid indices are [0, rows) x [0, cols). The launch grid is rounded up, so
-        // threads landing on i == rows or j == cols must bail out too (the previous
-        // '>' comparison let them through and they read and wrote out of bounds).
-        if (i >= rows || j >= cols)
-            return;
-
-        // Table entries whose Pv drops below this value are reset to zero.
-        const float MIN_PV = 1e-10f;
-
-        const uchar is_trained_dyn_model = c_stat.is_trained_dyn_model(i, j);
-        if (Ftd(i, j) || !is_trained_dyn_model)
-        {
-            const float alpha = is_trained_dyn_model ? alpha2 : alpha3;
-
-            float Pbcc = c_stat.Pbcc(i, j);
-
-            //update Pb
-            Pbcc *= (1.0f - alpha);
-            if (!foreground(i, j))
-            {
-                Pbcc += alpha;
-            }
-
-            int min_dist = numeric_limits<int>::max();
-            int indx = -1;
-
-            PT prevVal = prevFrame(i, j);
-            CT curVal = curFrame(i, j);
-
-            // Find best Vi match:
-            for (int k = 0; k < N2cc; ++k)
-            {
-                float PV_CC = c_stat.PV_CC(i, j, k);
-                if (!PV_CC)
-                    break;
-
-                if (PV_CC < MIN_PV)
-                {
-                    c_stat.PV_CC(i, j, k) = 0;
-                    c_stat.PVB_CC(i, j, k) = 0;
-                    continue;
-                }
-
-                // Exponential decay of memory
-                c_stat.PV_CC(i, j, k) = PV_CC * (1.0f - alpha);
-                c_stat.PVB_CC(i, j, k) = c_stat.PVB_CC(i, j, k) * (1.0f - alpha);
-
-                OT v1 = c_stat.V1_CC<OT>(i, j, k);
-
-                int3 val1 = make_int3(
-                    ::abs(v1.x - prevVal.x),
-                    ::abs(v1.y - prevVal.y),
-                    ::abs(v1.z - prevVal.z)
-                );
-
-                OT v2 = c_stat.V2_CC<OT>(i, j, k);
-
-                int3 val2 = make_int3(
-                    ::abs(v2.x - curVal.x),
-                    ::abs(v2.y - curVal.y),
-                    ::abs(v2.z - curVal.z)
-                );
-
-                int dist = val1.x + val1.y + val1.z + val2.x + val2.y + val2.z;
-
-                // Keep the closest entry whose every channel is within deltaCC.
-                if (dist < min_dist &&
-                    val1.x <= deltaCC && val1.y <= deltaCC && val1.z <= deltaCC &&
-                    val2.x <= deltaCC && val2.y <= deltaCC && val2.z <= deltaCC)
-                {
-                    min_dist = dist;
-                    indx = k;
-                }
-            }
-
-            if (indx < 0)
-            {
-                // Replace N2th elem in the table by new feature:
-                indx = N2cc - 1;
-                c_stat.PV_CC(i, j, indx) = alpha;
-                c_stat.PVB_CC(i, j, indx) = alpha;
-
-                //update Vt
-                c_stat.V1_CC<OT>(i, j, indx) = Output<OT>::make(prevVal.x, prevVal.y, prevVal.z);
-                c_stat.V2_CC<OT>(i, j, indx) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
-            }
-            else
-            {
-                // Update:
-                c_stat.PV_CC(i, j, indx) += alpha;
-
-                if (!foreground(i, j))
-                {
-                    c_stat.PVB_CC(i, j, indx) += alpha;
-                }
-            }
-
-            //re-sort CCt table by Pv
-            const float PV_CC_indx = c_stat.PV_CC(i, j, indx);
-            const float PVB_CC_indx = c_stat.PVB_CC(i, j, indx);
-            const OT V1_CC_indx = c_stat.V1_CC<OT>(i, j, indx);
-            const OT V2_CC_indx = c_stat.V2_CC<OT>(i, j, indx);
-            for (int k = 0; k < indx; ++k)
-            {
-                // First entry not larger than the updated one: insert the updated
-                // entry here and shift entries k..indx-1 down by one slot.
-                if (c_stat.PV_CC(i, j, k) <= PV_CC_indx)
-                {
-                    //shift elements
-                    float Pv_tmp1;
-                    float Pv_tmp2 = PV_CC_indx;
-
-                    float Pvb_tmp1;
-                    float Pvb_tmp2 = PVB_CC_indx;
-
-                    OT v1_tmp1;
-                    OT v1_tmp2 = V1_CC_indx;
-
-                    OT v2_tmp1;
-                    OT v2_tmp2 = V2_CC_indx;
-
-                    for (int l = k; l <= indx; ++l)
-                    {
-                        Pv_tmp1 = c_stat.PV_CC(i, j, l);
-                        c_stat.PV_CC(i, j, l) = Pv_tmp2;
-                        Pv_tmp2 = Pv_tmp1;
-
-                        Pvb_tmp1 = c_stat.PVB_CC(i, j, l);
-                        c_stat.PVB_CC(i, j, l) = Pvb_tmp2;
-                        Pvb_tmp2 = Pvb_tmp1;
-
-                        v1_tmp1 = c_stat.V1_CC<OT>(i, j, l);
-                        c_stat.V1_CC<OT>(i, j, l) = v1_tmp2;
-                        v1_tmp2 = v1_tmp1;
-
-                        v2_tmp1 = c_stat.V2_CC<OT>(i, j, l);
-                        c_stat.V2_CC<OT>(i, j, l) = v2_tmp2;
-                        v2_tmp2 = v2_tmp1;
-                    }
-
-                    break;
-                }
-            }
-
-            float sum1 = 0.0f;
-            float sum2 = 0.0f;
-
-            //check "once-off" changes
-            for (int k = 0; k < N1cc; ++k)
-            {
-                const float PV_CC = c_stat.PV_CC(i, j, k);
-                if (!PV_CC)
-                    break;
-
-                sum1 += PV_CC;
-                sum2 += c_stat.PVB_CC(i, j, k);
-            }
-
-            if (sum1 > T)
-                c_stat.is_trained_dyn_model(i, j) = 1;
-
-            float diff = sum1 - Pbcc * sum2;
-
-            // Update stat table:
-            if (diff > T)
-            {
-                //new BG features are discovered
-                for (int k = 0; k < N1cc; ++k)
-                {
-                    const float PV_CC = c_stat.PV_CC(i, j, k);
-                    if (!PV_CC)
-                        break;
-
-                    c_stat.PVB_CC(i, j, k) = (PV_CC - Pbcc * c_stat.PVB_CC(i, j, k)) / (1.0f - Pbcc);
-                }
-            }
-
-            c_stat.Pbcc(i, j) = Pbcc;
-        }
-
-        // Handle "stationary" pixel:
-        if (!Ftd(i, j))
-        {
-            const float alpha = c_stat.is_trained_st_model(i, j) ? alpha2 : alpha3;
-
-            float Pbc = c_stat.Pbc(i, j);
-
-            //update Pb
-            Pbc *= (1.0f - alpha);
-            if (!foreground(i, j))
-            {
-                Pbc += alpha;
-            }
-
-            int min_dist = numeric_limits<int>::max();
-            int indx = -1;
-
-            CT curVal = curFrame(i, j);
-
-            //find best Vi match
-            for (int k = 0; k < N2c; ++k)
-            {
-                float PV_C = c_stat.PV_C(i, j, k);
-
-                if (PV_C < MIN_PV)
-                {
-                    c_stat.PV_C(i, j, k) = 0;
-                    c_stat.PVB_C(i, j, k) = 0;
-                    continue;
-                }
-
-                // Exponential decay of memory
-                c_stat.PV_C(i, j, k) = PV_C * (1.0f - alpha);
-                c_stat.PVB_C(i, j, k) = c_stat.PVB_C(i, j, k) * (1.0f - alpha);
-
-                OT v = c_stat.V_C<OT>(i, j, k);
-                int3 val = make_int3(
-                    ::abs(v.x - curVal.x),
-                    ::abs(v.y - curVal.y),
-                    ::abs(v.z - curVal.z)
-                );
-
-                int dist = val.x + val.y + val.z;
-
-                // Keep the closest entry whose every channel is within deltaC.
-                if (dist < min_dist && val.x <= deltaC && val.y <= deltaC && val.z <= deltaC)
-                {
-                    min_dist = dist;
-                    indx = k;
-                }
-            }
-
-            if (indx < 0)
-            {
-                //N2th elem in the table is replaced by a new feature
-                indx = N2c - 1;
-
-                c_stat.PV_C(i, j, indx) = alpha;
-                c_stat.PVB_C(i, j, indx) = alpha;
-
-                //update Vt
-                c_stat.V_C<OT>(i, j, indx) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
-            }
-            else
-            {
-                //update
-                c_stat.PV_C(i, j, indx) += alpha;
-
-                if (!foreground(i, j))
-                {
-                    c_stat.PVB_C(i, j, indx) += alpha;
-                }
-            }
-
-            //re-sort Ct table by Pv
-            const float PV_C_indx = c_stat.PV_C(i, j, indx);
-            const float PVB_C_indx = c_stat.PVB_C(i, j, indx);
-            OT V_C_indx = c_stat.V_C<OT>(i, j, indx);
-            for (int k = 0; k < indx; ++k)
-            {
-                // First entry not larger than the updated one: insert the updated
-                // entry here and shift entries k..indx-1 down by one slot.
-                if (c_stat.PV_C(i, j, k) <= PV_C_indx)
-                {
-                    //shift elements
-                    float Pv_tmp1;
-                    float Pv_tmp2 = PV_C_indx;
-
-                    float Pvb_tmp1;
-                    float Pvb_tmp2 = PVB_C_indx;
-
-                    OT v_tmp1;
-                    OT v_tmp2 = V_C_indx;
-
-                    for (int l = k; l <= indx; ++l)
-                    {
-                        Pv_tmp1 = c_stat.PV_C(i, j, l);
-                        c_stat.PV_C(i, j, l) = Pv_tmp2;
-                        Pv_tmp2 = Pv_tmp1;
-
-                        Pvb_tmp1 = c_stat.PVB_C(i, j, l);
-                        c_stat.PVB_C(i, j, l) = Pvb_tmp2;
-                        Pvb_tmp2 = Pvb_tmp1;
-
-                        v_tmp1 = c_stat.V_C<OT>(i, j, l);
-                        c_stat.V_C<OT>(i, j, l) = v_tmp2;
-                        v_tmp2 = v_tmp1;
-                    }
-
-                    break;
-                }
-            }
-
-            // Check "once-off" changes:
-            float sum1 = 0.0f;
-            float sum2 = 0.0f;
-            for (int k = 0; k < N1c; ++k)
-            {
-                const float PV_C = c_stat.PV_C(i, j, k);
-                if (!PV_C)
-                    break;
-
-                sum1 += PV_C;
-                sum2 += c_stat.PVB_C(i, j, k);
-            }
-
-            if (sum1 > T)
-                c_stat.is_trained_st_model(i, j) = 1;
-
-            float diff = sum1 - Pbc * sum2;
-
-            // Update stat table:
-            if (diff > T)
-            {
-                //new BG features are discovered
-                for (int k = 0; k < N1c; ++k)
-                {
-                    const float PV_C = c_stat.PV_C(i, j, k);
-                    if (!PV_C)
-                        break;
-
-                    c_stat.PVB_C(i, j, k) = (PV_C - Pbc * c_stat.PVB_C(i, j, k)) / (1.0f - Pbc);
-                }
-
-                c_stat.Pbc(i, j) = 1.0f - Pbc;
-            }
-            else
-            {
-                c_stat.Pbc(i, j) = Pbc;
-            }
-        } // if !(change detection) at pixel (i,j)
-
-        // Update the reference BG image:
-        if (!foreground(i, j))
-        {
-            CT curVal = curFrame(i, j);
-
-            if (!Ftd(i, j) && !Fbd(i, j))
-            {
-                // Apply IIR filter:
-                OT oldVal = background(i, j);
-
-                int3 newVal = make_int3(
-                    __float2int_rn(oldVal.x * (1.0f - alpha1) + curVal.x * alpha1),
-                    __float2int_rn(oldVal.y * (1.0f - alpha1) + curVal.y * alpha1),
-                    __float2int_rn(oldVal.z * (1.0f - alpha1) + curVal.z * alpha1)
-                );
-
-                background(i, j) = Output<OT>::make(
-                    static_cast<uchar>(newVal.x),
-                    static_cast<uchar>(newVal.y),
-                    static_cast<uchar>(newVal.z)
-                );
-            }
-            else
-            {
-                // Changed but classified as background: take the current pixel as is.
-                background(i, j) = Output<OT>::make(curVal.x, curVal.y, curVal.z);
-            }
-        }
-    }
-
-    // Typed launcher for the updateBackgroundModel kernel: 32x8 thread blocks covering
-    // the whole frame, L1-preferring cache configuration, launched on `stream`.
-    // Blocks until the device is idle when the default stream (0) is used.
-    template <typename PT, typename CT, typename OT>
-    struct UpdateBackgroundModel
-    {
-        static void call(PtrStepSz<PT> prevFrame, PtrStepSz<CT> curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSz<OT> background,
-                         int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
-                         cudaStream_t stream)
-        {
-            const dim3 threads(32, 8);
-            const dim3 blocks(divUp(prevFrame.cols, threads.x), divUp(prevFrame.rows, threads.y));
-
-            cudaSafeCall( cudaFuncSetCacheConfig(updateBackgroundModel<PT, CT, OT, PtrStep<PT>, PtrStep<CT>, PtrStepb, PtrStepb>, cudaFuncCachePreferL1) );
-
-            updateBackgroundModel<PT, CT, OT, PtrStep<PT>, PtrStep<CT>, PtrStepb, PtrStepb><<<blocks, threads, 0, stream>>>(
-                prevFrame.cols, prevFrame.rows,
-                prevFrame, curFrame,
-                Ftd, Fbd, foreground, background,
-                deltaC, deltaCC,
-                alpha1, alpha2, alpha3,
-                N1c, N1cc, N2c, N2cc,
-                T);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream != 0)
-                return;
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-
-    // Byte-typed entry point: re-views prevFrame, curFrame and background as images of
-    // PT, CT and OT elements and forwards everything to UpdateBackgroundModel<PT, CT, OT>::call.
-    template <typename PT, typename CT, typename OT>
-    void updateBackgroundModel_gpu(PtrStepSzb prevFrame, PtrStepSzb curFrame, PtrStepSzb Ftd, PtrStepSzb Fbd, PtrStepSzb foreground, PtrStepSzb background,
-                                   int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
-                                   cudaStream_t stream)
-    {
-        const PtrStepSz<PT> typedPrev(prevFrame);
-        const PtrStepSz<CT> typedCur(curFrame);
-        const PtrStepSz<OT> typedBackground(background);
-
-        UpdateBackgroundModel<PT, CT, OT>::call(typedPrev, typedCur, Ftd, Fbd, foreground, typedBackground,
-                                                deltaC, deltaCC, alpha1, alpha2, alpha3, N1c, N1cc, N2c, N2cc, T, stream);
-    }
-
-    // Explicit instantiations for every uchar3/uchar4 combination of (PT, CT, OT).
-    template void updateBackgroundModel_gpu<uchar3, uchar3, uchar3>(PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, int, int, float, float, float, int, int, int, int, float, cudaStream_t);
-    template void updateBackgroundModel_gpu<uchar3, uchar3, uchar4>(PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, int, int, float, float, float, int, int, int, int, float, cudaStream_t);
-    template void updateBackgroundModel_gpu<uchar3, uchar4, uchar3>(PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, int, int, float, float, float, int, int, int, int, float, cudaStream_t);
-    template void updateBackgroundModel_gpu<uchar3, uchar4, uchar4>(PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, int, int, float, float, float, int, int, int, int, float, cudaStream_t);
-    template void updateBackgroundModel_gpu<uchar4, uchar3, uchar3>(PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, int, int, float, float, float, int, int, int, int, float, cudaStream_t);
-    template void updateBackgroundModel_gpu<uchar4, uchar3, uchar4>(PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, int, int, float, float, float, int, int, int, int, float, cudaStream_t);
-    template void updateBackgroundModel_gpu<uchar4, uchar4, uchar3>(PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, int, int, float, float, float, int, int, int, int, float, cudaStream_t);
-    template void updateBackgroundModel_gpu<uchar4, uchar4, uchar4>(PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, PtrStepSzb, int, int, float, float, float, int, int, int, int, float, cudaStream_t);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/fgd_bgfg_common.hpp
+++ b/modules/gpu/src/cuda/fgd_bgfg_common.hpp
@@ -1,189 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __FGD_BGFG_COMMON_HPP__
-#define __FGD_BGFG_COMMON_HPP__
-
-#include "opencv2/core/cuda_devptrs.hpp"
-
-namespace bgfg
-{
-    struct BGPixelStat // Raw byte pointers plus row steps (in bytes) for each per-pixel statistic table.
-    {
-    public:
-#ifdef __CUDACC__
-        __device__ float& Pbc(int i, int j); // (i, j) accessors for the two-index float tables
-        __device__ float& Pbcc(int i, int j);
-        // (i, j) accessors for the two per-pixel "is trained" byte flags
-        __device__ unsigned char& is_trained_st_model(int i, int j);
-        __device__ unsigned char& is_trained_dyn_model(int i, int j);
-        // (i, j, k) accessors for the ctable_* arrays; k selects a plane of rows_ rows
-        __device__ float& PV_C(int i, int j, int k);
-        __device__ float& PVB_C(int i, int j, int k);
-        template <typename T> __device__ T& V_C(int i, int j, int k);
-        // (i, j, k) accessors for the cctable_* arrays, indexed the same way
-        __device__ float& PV_CC(int i, int j, int k);
-        __device__ float& PVB_CC(int i, int j, int k);
-        template <typename T> __device__ T& V1_CC(int i, int j, int k);
-        template <typename T> __device__ T& V2_CC(int i, int j, int k);
-#endif
-        // Rows per plane: the (i, j, k) accessors address row (k * rows_ + i) of their table.
-        int rows_;
-        // Each table below is a base pointer plus the byte distance between consecutive rows.
-        unsigned char* Pbc_data_;
-        size_t Pbc_step_;
-
-        unsigned char* Pbcc_data_;
-        size_t Pbcc_step_;
-
-        unsigned char* is_trained_st_model_data_;
-        size_t is_trained_st_model_step_;
-
-        unsigned char* is_trained_dyn_model_data_;
-        size_t is_trained_dyn_model_step_;
-
-        unsigned char* ctable_Pv_data_;
-        size_t ctable_Pv_step_;
-
-        unsigned char* ctable_Pvb_data_;
-        size_t ctable_Pvb_step_;
-
-        unsigned char* ctable_v_data_;
-        size_t ctable_v_step_;
-
-        unsigned char* cctable_Pv_data_;
-        size_t cctable_Pv_step_;
-
-        unsigned char* cctable_Pvb_data_;
-        size_t cctable_Pvb_step_;
-
-        unsigned char* cctable_v1_data_;
-        size_t cctable_v1_step_;
-
-        unsigned char* cctable_v2_data_;
-        size_t cctable_v2_step_;
-    };
-
-#ifdef __CUDACC__
-    // Reference to the float at row i, column j of the Pbc table (rows are Pbc_step_ bytes apart).
-    __device__ __forceinline__ float& BGPixelStat::Pbc(int i, int j)
-    {
-        float* const row = reinterpret_cast<float*>(Pbc_data_ + i * Pbc_step_);
-        return row[j];
-    }
-
-    // Reference to the float at row i, column j of the Pbcc table (rows are Pbcc_step_ bytes apart).
-    __device__ __forceinline__ float& BGPixelStat::Pbcc(int i, int j)
-    {
-        float* const row = reinterpret_cast<float*>(Pbcc_data_ + i * Pbcc_step_);
-        return row[j];
-    }
-
-    // Reference to the byte flag at row i, column j of the is_trained_st_model table.
-    __device__ __forceinline__ unsigned char& BGPixelStat::is_trained_st_model(int i, int j)
-    {
-        unsigned char* const row = is_trained_st_model_data_ + i * is_trained_st_model_step_;
-        return row[j];
-    }
-
-    // Reference to the byte flag at row i, column j of the is_trained_dyn_model table.
-    __device__ __forceinline__ unsigned char& BGPixelStat::is_trained_dyn_model(int i, int j)
-    {
-        unsigned char* const row = is_trained_dyn_model_data_ + i * is_trained_dyn_model_step_;
-        return row[j];
-    }
-
-    // Reference to the float at (row i, column j) in plane k of the ctable_Pv table;
-    // plane k starts rows_ * k rows into the table.
-    __device__ __forceinline__ float& BGPixelStat::PV_C(int i, int j, int k)
-    {
-        const int planeRow = k * rows_ + i;
-        float* const row = reinterpret_cast<float*>(ctable_Pv_data_ + planeRow * ctable_Pv_step_);
-        return row[j];
-    }
-
-    // Reference to the float at (row i, column j) in plane k of the ctable_Pvb table;
-    // plane k starts rows_ * k rows into the table.
-    __device__ __forceinline__ float& BGPixelStat::PVB_C(int i, int j, int k)
-    {
-        const int planeRow = k * rows_ + i;
-        float* const row = reinterpret_cast<float*>(ctable_Pvb_data_ + planeRow * ctable_Pvb_step_);
-        return row[j];
-    }
-
-    // Reference to the T element at (row i, column j) in plane k of the ctable_v table;
-    // plane k starts rows_ * k rows into the table.
-    template <typename T> __device__ __forceinline__ T& BGPixelStat::V_C(int i, int j, int k)
-    {
-        const int planeRow = k * rows_ + i;
-        T* const row = reinterpret_cast<T*>(ctable_v_data_ + planeRow * ctable_v_step_);
-        return row[j];
-    }
-
-    // Reference to the float at (row i, column j) in plane k of the cctable_Pv table;
-    // plane k starts rows_ * k rows into the table.
-    __device__ __forceinline__ float& BGPixelStat::PV_CC(int i, int j, int k)
-    {
-        const int planeRow = k * rows_ + i;
-        float* const row = reinterpret_cast<float*>(cctable_Pv_data_ + planeRow * cctable_Pv_step_);
-        return row[j];
-    }
-
-    // Reference to the float at (row i, column j) in plane k of the cctable_Pvb table;
-    // plane k starts rows_ * k rows into the table.
-    __device__ __forceinline__ float& BGPixelStat::PVB_CC(int i, int j, int k)
-    {
-        const int planeRow = k * rows_ + i;
-        float* const row = reinterpret_cast<float*>(cctable_Pvb_data_ + planeRow * cctable_Pvb_step_);
-        return row[j];
-    }
-
-    // Reference to the T element at (row i, column j) in plane k of the cctable_v1 table;
-    // plane k starts rows_ * k rows into the table.
-    template <typename T> __device__ __forceinline__ T& BGPixelStat::V1_CC(int i, int j, int k)
-    {
-        const int planeRow = k * rows_ + i;
-        T* const row = reinterpret_cast<T*>(cctable_v1_data_ + planeRow * cctable_v1_step_);
-        return row[j];
-    }
-
-    // Reference to the T element at (row i, column j) in plane k of the cctable_v2 table;
-    // plane k starts rows_ * k rows into the table.
-    template <typename T> __device__ __forceinline__ T& BGPixelStat::V2_CC(int i, int j, int k)
-    {
-        const int planeRow = k * rows_ + i;
-        T* const row = reinterpret_cast<T*>(cctable_v2_data_ + planeRow * cctable_v2_step_);
-        return row[j];
-    }
-#endif
-
-    const int PARTIAL_HISTOGRAM_COUNT = 240; // NOTE(review): presumably the number of partial histograms behind the partialBuf* arguments below - confirm in the .cu file
-    const int HISTOGRAM_BIN_COUNT = 256; // NOTE(review): presumably the number of bins per histogram - confirm in the .cu file
-    // Everything below is declaration-only; no definitions appear in this header.
-    template <typename PT, typename CT>
-    void calcDiffHistogram_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
-                               unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
-                               unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
-                               bool cc20, cudaStream_t stream);
-
-    template <typename PT, typename CT>
-    void calcDiffThreshMask_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream);
-    // NOTE(review): presumably uploads `stat` for use by the device-side BGPixelStat accessors - confirm in the .cu file
-    void setBGPixelStat(const BGPixelStat& stat);
-
-    template <typename PT, typename CT, typename OT>
-    void bgfgClassification_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
-                                cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd, cv::gpu::PtrStepSzb foreground,
-                                int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-    // Takes byte-typed image views; PT, CT and OT name the element types of prevFrame, curFrame and background.
-    template <typename PT, typename CT, typename OT>
-    void updateBackgroundModel_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
-                                   cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd, cv::gpu::PtrStepSzb foreground, cv::gpu::PtrStepSzb background,
-                                   int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
-                                   cudaStream_t stream);
-}
-
-#endif // __FGD_BGFG_COMMON_HPP__
--- a/modules/gpu/src/cuda/gftt.cu
+++ b/modules/gpu/src/cuda/gftt.cu
@@ -1,143 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include <thrust/device_ptr.h>
-#include <thrust/sort.h>
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/utility.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace gfft
-    {
-        texture<float, cudaTextureType2D, cudaReadModeElementType> eigTex(0, cudaFilterModePoint, cudaAddressModeClamp); // 2D float texture (point filtering, clamped addressing) bound to the `eig` image by findCorners_gpu and sortCorners_gpu
-        // Device-side running count of accepted corners; findCorners_gpu zeroes it before each launch and reads it back afterwards.
-        __device__ int g_counter = 0;
-
-        template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols)
-        { // One thread per pixel: appends (x, y) to `corners` for every masked interior pixel of eigTex that exceeds `threshold` and has no larger 8-neighbour.
-            const int j = blockIdx.x * blockDim.x + threadIdx.x; // column (x)
-            const int i = blockIdx.y * blockDim.y + threadIdx.y; // row (y)
-            // The one-pixel image border is skipped, so all eight neighbours read below lie inside the image.
-            if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j))
-            {
-                float val = tex2D(eigTex, j, i);
-
-                if (val > threshold)
-                {
-                    float maxVal = val;
-                    // Row above
-                    maxVal = ::fmax(tex2D(eigTex, j - 1, i - 1), maxVal);
-                    maxVal = ::fmax(tex2D(eigTex, j    , i - 1), maxVal);
-                    maxVal = ::fmax(tex2D(eigTex, j + 1, i - 1), maxVal);
-                    // Left and right neighbours
-                    maxVal = ::fmax(tex2D(eigTex, j - 1, i), maxVal);
-                    maxVal = ::fmax(tex2D(eigTex, j + 1, i), maxVal);
-                    // Row below
-                    maxVal = ::fmax(tex2D(eigTex, j - 1, i + 1), maxVal);
-                    maxVal = ::fmax(tex2D(eigTex, j    , i + 1), maxVal);
-                    maxVal = ::fmax(tex2D(eigTex, j + 1, i + 1), maxVal);
-                    // Equality means no neighbour is larger; a neighbour with the same value does not disqualify the pixel.
-                    if (val == maxVal)
-                    {
-                        const int ind = ::atomicAdd(&g_counter, 1); // reserve an output slot; the counter keeps growing even after the buffer is full
-                        // Corners beyond max_count are counted but not stored.
-                        if (ind < max_count)
-                            corners[ind] = make_float2(j, i);
-                    }
-                }
-            }
-        }
-
-        // Runs the findCorners kernel over `eig` (optionally restricted by `mask`) and
-        // returns how many corners were written to `corners`, never more than max_count.
-        // The call blocks until the kernel has finished.
-        int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count)
-        {
-            // Locate the device-side counter and reset it for this run.
-            void* counterDev;
-            cudaSafeCall( cudaGetSymbolAddress(&counterDev, g_counter) );
-            cudaSafeCall( cudaMemset(counterDev, 0, sizeof(int)) );
-
-            bindTexture(&eigTex, eig);
-
-            const dim3 threads(16, 16);
-            const dim3 blocks(divUp(eig.cols, threads.x), divUp(eig.rows, threads.y));
-
-            const bool hasMask = (mask.data != 0);
-            if (!hasMask)
-            {
-                findCorners<<<blocks, threads>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
-            }
-            else
-            {
-                findCorners<<<blocks, threads>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
-            }
-
-            cudaSafeCall( cudaGetLastError() );
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            // The counter may exceed max_count because the kernel keeps counting after the buffer is full.
-            int found;
-            cudaSafeCall( cudaMemcpy(&found, counterDev, sizeof(int), cudaMemcpyDeviceToHost) );
-
-            return (max_count < found) ? max_count : found;
-        }
-
-        // Comparison functor for thrust::sort: a corner precedes another when the eigTex
-        // value sampled at its (x, y) is strictly greater, giving descending order.
-        class EigGreater
-        {
-        public:
-            __device__ __forceinline__ bool operator()(float2 a, float2 b) const
-            {
-                const float lhs = tex2D(eigTex, a.x, a.y);
-                const float rhs = tex2D(eigTex, b.x, b.y);
-                return lhs > rhs;
-            }
-        };
-
-
-        // Sorts the first `count` entries of the device array `corners` in place, by
-        // descending value of `eig` at each corner (see EigGreater).
-        void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count)
-        {
-            // EigGreater samples eigTex, so bind it to the eigenvalue image first.
-            bindTexture(&eigTex, eig);
-
-            const thrust::device_ptr<float2> first(corners);
-            const thrust::device_ptr<float2> last = first + count;
-
-            thrust::sort(first, last, EigGreater());
-        }
-    } // namespace gfft
-}}}
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/hist.cu
+++ b/modules/gpu/src/cuda/hist.cu
@@ -1,153 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/emulation.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::cudev;
-
-namespace hist
-{
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
-    { // Adds the 256-bin histogram of the 8-bit image `src` (row pitch `step` bytes) into `hist`; `hist` is accumulated into, not cleared, here.
-        __shared__ int shist[256]; // per-block partial histogram
-        // Each threadIdx.y value handles one image row; blocks are indexed along x only.
-        const int y = blockIdx.x * blockDim.y + threadIdx.y;
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x; // flat thread index, also used as a bin index, so the block must have exactly 256 threads (histogram256 launches 32x8)
-        // Every thread clears one bin before any counting starts.
-        shist[tid] = 0;
-        __syncthreads();
-
-        if (y < rows)
-        {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * step); // row viewed as 32-bit words; NOTE(review): assumes each row start is 4-byte aligned - confirm
-            // Whole words: the row's threads stride over them, counting four pixels per load.
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
-
-                Emulation::smem::atomicAdd(&shist[(data >>  0) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >>  8) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
-            }
-            // The 1-3 leftover pixels of the row are counted byte by byte by the row's first thread.
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    Emulation::smem::atomicAdd(&shist[data], 1);
-                }
-            }
-        }
-        // Wait until the whole block has finished counting before shist is read.
-        __syncthreads();
-        // Each thread flushes its own bin to the global histogram, skipping empty bins.
-        const int histVal = shist[tid];
-        if (histVal > 0)
-            ::atomicAdd(hist + tid, histVal);
-    }
-
-    // Launches histogram256Kernel over `src` on `stream`, accumulating into `hist`.
-    // Blocks are 32x8 threads and each block covers 8 image rows. On the default
-    // stream (0) the call waits for the device to finish before returning.
-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
-    {
-        const dim3 threads(32, 8);
-        const dim3 blocks(divUp(src.rows, threads.y));
-
-        histogram256Kernel<<<blocks, threads, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream != 0)
-            return;
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////
-
-namespace hist
-{
-    // Lookup table in constant memory; equalizeHist() fills it before launching the transform.
-    __constant__ int c_lut[256];
-
-    // Per-pixel functor: maps an 8-bit value to round(scale * c_lut[value]).
-    struct EqualizeHist : unary_function<uchar, uchar>
-    {
-        float scale;
-
-        __host__ EqualizeHist(float _scale) : scale(_scale) {}
-
-        __device__ __forceinline__ uchar operator ()(uchar val) const
-        {
-            // Round to nearest; the int result is then narrowed to uchar by the return type.
-            return __float2int_rn(scale * c_lut[val]);
-        }
-    };
-}
-
-namespace cv { namespace gpu { namespace cudev
-{
-    template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist> // Inherits the default transform traits for hist::EqualizeHist and overrides only smart_shift.
-    {
-        enum { smart_shift = 4 }; // NOTE(review): presumably the number of elements each thread handles in cudev::transform - confirm against transform.hpp
-    };
-}}}
-
-namespace hist
-{
-    // Equalizes `src` into `dst`: copies the 256-entry device table `lut` into c_lut,
-    // then maps every pixel through EqualizeHist with scale = 255 / (cols * rows).
-    // The table copy is synchronous on the default stream and asynchronous otherwise.
-    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
-    {
-        const size_t lutBytes = 256 * sizeof(int);
-
-        if (stream != 0)
-        {
-            cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, lutBytes, 0, cudaMemcpyDeviceToDevice, stream) );
-        }
-        else
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, lutBytes, 0, cudaMemcpyDeviceToDevice) );
-        }
-
-        const int pixelCount = src.cols * src.rows;
-        const float scale = 255.0f / pixelCount;
-
-        cudev::transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
-    }
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/hough.cu
+++ b/modules/gpu/src/cuda/hough.cu
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
--- a/modules/gpu/src/cuda/integral_image.cu
+++ b/modules/gpu/src/cuda/integral_image.cu
@@ -1,472 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        // Splits a 32-bit word into its four bytes: x holds bits 0-7, y bits 8-15,
-        // z bits 16-23 and w bits 24-31.
-        __device__ uchar4 int_to_uchar4(unsigned int in)
-        {
-            uchar4 bytes;
-            bytes.x = (in >>  0) & 0xffu;
-            bytes.y = (in >>  8) & 0xffu;
-            bytes.z = (in >> 16) & 0xffu;
-            bytes.w = (in >> 24) & 0xffu;
-            return bytes;
-        }
-
-        __global__ void shfl_integral_horizontal(const PtrStep<uint4> img, PtrStep<uint4> integral)
-        {
-        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
-            __shared__ int sums[128];
-
-            const int id = threadIdx.x;
-            const int lane_id = id % warpSize;
-            const int warp_id = id / warpSize;
-
-            const uint4 data = img(blockIdx.x, id);
-
-            const uchar4 a = int_to_uchar4(data.x);
-            const uchar4 b = int_to_uchar4(data.y);
-            const uchar4 c = int_to_uchar4(data.z);
-            const uchar4 d = int_to_uchar4(data.w);
-
-            int result[16];
-
-            result[0]  =              a.x;
-            result[1]  = result[0]  + a.y;
-            result[2]  = result[1]  + a.z;
-            result[3]  = result[2]  + a.w;
-
-            result[4]  = result[3]  + b.x;
-            result[5]  = result[4]  + b.y;
-            result[6]  = result[5]  + b.z;
-            result[7]  = result[6]  + b.w;
-
-            result[8]  = result[7]  + c.x;
-            result[9]  = result[8]  + c.y;
-            result[10] = result[9]  + c.z;
-            result[11] = result[10] + c.w;
-
-            result[12] = result[11] + d.x;
-            result[13] = result[12] + d.y;
-            result[14] = result[13] + d.z;
-            result[15] = result[14] + d.w;
-
-            int sum = result[15];
-
-            // the prefix sum for each thread's 16 value is computed,
-            // now the final sums (result[15]) need to be shared
-            // with the other threads and add.  To do this,
-            // the __shfl_up() instruction is used and a shuffle scan
-            // operation is performed to distribute the sums to the correct
-            // threads
-            #pragma unroll
-            for (int i = 1; i < 32; i *= 2)
-            {
-                const int n = __shfl_up(sum, i, 32);
-
-                if (lane_id >= i)
-                {
-                    #pragma unroll
-                    for (int i = 0; i < 16; ++i)
-                        result[i] += n;
-
-                    sum += n;
-                }
-            }
-
-            // Now the final sum for the warp must be shared
-            // between warps.  This is done by each warp
-            // having a thread store to shared memory, then
-            // having some other warp load the values and
-            // compute a prefix sum, again by using __shfl_up.
-            // The results are uniformly added back to the warps.
-            // last thread in the warp holding sum of the warp
-            // places that in shared
-            if (threadIdx.x % warpSize == warpSize - 1)
-                sums[warp_id] = result[15];
-
-            __syncthreads();
-
-            if (warp_id == 0)
-            {
-                int warp_sum = sums[lane_id];
-
-                #pragma unroll
-                for (int i = 1; i <= 32; i *= 2)
-                {
-                    const int n = __shfl_up(warp_sum, i, 32);
-
-                    if (lane_id >= i)
-                        warp_sum += n;
-                }
-
-                sums[lane_id] = warp_sum;
-            }
-
-            __syncthreads();
-
-            int blockSum = 0;
-
-            // fold in unused warp
-            if (warp_id > 0)
-            {
-                blockSum = sums[warp_id - 1];
-
-                #pragma unroll
-                for (int i = 0; i < 16; ++i)
-                    result[i] += blockSum;
-            }
-
-            // assemble result
-            // Each thread has 16 values to write, which are
-            // now integer data (to avoid overflow).  Instead of
-            // each thread writing consecutive uint4s, the
-            // approach shown here experiments using
-            // the shuffle command to reformat the data
-            // inside the registers so that each thread holds
-            // consecutive data to be written so larger contiguous
-            // segments can be assembled for writing.
-
-            /*
-                For example data that needs to be written as
-
-                GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3
-                but is stored in registers (r0..r3), in four threads (0..3) as:
-
-                threadId   0  1  2  3
-                  r0      x0 y0 z0 w0
-                  r1      x1 y1 z1 w1
-                  r2      x2 y2 z2 w2
-                  r3      x3 y3 z3 w3
-
-                  after apply __shfl_xor operations to move data between registers r1..r3:
-
-                threadId  00 01 10 11
-                          x0 y0 z0 w0
-                 xor(01)->y1 x1 w1 z1
-                 xor(10)->z2 w2 x2 y2
-                 xor(11)->w3 z3 y3 x3
-
-                 and now x0..x3, and z0..z3 can be written out in order by all threads.
-
-                 In the current code, each register above is actually representing
-                 four integers to be written as uint4's to GMEM.
-            */
-
-            result[4]  = __shfl_xor(result[4] , 1, 32);
-            result[5]  = __shfl_xor(result[5] , 1, 32);
-            result[6]  = __shfl_xor(result[6] , 1, 32);
-            result[7]  = __shfl_xor(result[7] , 1, 32);
-
-            result[8]  = __shfl_xor(result[8] , 2, 32);
-            result[9]  = __shfl_xor(result[9] , 2, 32);
-            result[10] = __shfl_xor(result[10], 2, 32);
-            result[11] = __shfl_xor(result[11], 2, 32);
-
-            result[12] = __shfl_xor(result[12], 3, 32);
-            result[13] = __shfl_xor(result[13], 3, 32);
-            result[14] = __shfl_xor(result[14], 3, 32);
-            result[15] = __shfl_xor(result[15], 3, 32);
-
-            uint4* integral_row = integral.ptr(blockIdx.x);
-            uint4 output;
-
-            ///////
-
-            if (threadIdx.x % 4 == 0)
-                output = make_uint4(result[0], result[1], result[2], result[3]);
-
-            if (threadIdx.x % 4 == 1)
-                output = make_uint4(result[4], result[5], result[6], result[7]);
-
-            if (threadIdx.x % 4 == 2)
-                output = make_uint4(result[8], result[9], result[10], result[11]);
-
-            if (threadIdx.x % 4 == 3)
-                output = make_uint4(result[12], result[13], result[14], result[15]);
-
-            integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;
-
-            ///////
-
-            if (threadIdx.x % 4 == 2)
-                output = make_uint4(result[0], result[1], result[2], result[3]);
-
-            if (threadIdx.x % 4 == 3)
-                output = make_uint4(result[4], result[5], result[6], result[7]);
-
-            if (threadIdx.x % 4 == 0)
-                output = make_uint4(result[8], result[9], result[10], result[11]);
-
-            if (threadIdx.x % 4 == 1)
-                output = make_uint4(result[12], result[13], result[14], result[15]);
-
-            integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
-
-            // continuning from the above example,
-            // this use of __shfl_xor() places the y0..y3 and w0..w3 data
-            // in order.
-
-            #pragma unroll
-            for (int i = 0; i < 16; ++i)
-                result[i] = __shfl_xor(result[i], 1, 32);
-
-            if (threadIdx.x % 4 == 0)
-                output = make_uint4(result[0], result[1], result[2], result[3]);
-
-            if (threadIdx.x % 4 == 1)
-                output = make_uint4(result[4], result[5], result[6], result[7]);
-
-            if (threadIdx.x % 4 == 2)
-                output = make_uint4(result[8], result[9], result[10], result[11]);
-
-            if (threadIdx.x % 4 == 3)
-                output = make_uint4(result[12], result[13], result[14], result[15]);
-
-            integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;
-
-            ///////
-
-            if (threadIdx.x % 4 == 2)
-                output = make_uint4(result[0], result[1], result[2], result[3]);
-
-            if (threadIdx.x % 4 == 3)
-                output = make_uint4(result[4], result[5], result[6], result[7]);
-
-            if (threadIdx.x % 4 == 0)
-                output = make_uint4(result[8], result[9], result[10], result[11]);
-
-            if (threadIdx.x % 4 == 1)
-                output = make_uint4(result[12], result[13], result[14], result[15]);
-
-            integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
-        #endif
-        }
-
-        // This kernel computes columnwise prefix sums.  When the data input is
-        // the row sums from above, this completes the integral image.
-        // The approach here is to have each block compute a local set of sums.
-        // First , the data covered by the block is loaded into shared memory,
-        // then instead of performing a sum in shared memory using __syncthreads
-        // between stages, the data is reformatted so that the necessary sums
-        // occur inside warps and the shuffle scan operation is used.
-        // The final set of sums from the block is then propgated, with the block
-        // computing "down" the image and adding the running sum to the local
-        // block sums.
-        __global__ void shfl_integral_vertical(PtrStepSz<unsigned int> integral)
-        {
-        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
-            __shared__ unsigned int sums[32][9];
-
-            const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
-            const int lane_id = tidx % 8;
-
-            if (tidx >= integral.cols)
-                return;
-
-            sums[threadIdx.x][threadIdx.y] = 0;
-            __syncthreads();
-
-            unsigned int stepSum = 0;
-
-            for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
-            {
-                unsigned int* p = integral.ptr(y) + tidx;
-
-                unsigned int sum = *p;
-
-                sums[threadIdx.x][threadIdx.y] = sum;
-                __syncthreads();
-
-                // place into SMEM
-                // shfl scan reduce the SMEM, reformating so the column
-                // sums are computed in a warp
-                // then read out properly
-                const int j = threadIdx.x % 8;
-                const int k = threadIdx.x / 8 + threadIdx.y * 4;
-
-                int partial_sum = sums[k][j];
-
-                for (int i = 1; i <= 8; i *= 2)
-                {
-                    int n = __shfl_up(partial_sum, i, 32);
-
-                    if (lane_id >= i)
-                        partial_sum += n;
-                }
-
-                sums[k][j] = partial_sum;
-                __syncthreads();
-
-                if (threadIdx.y > 0)
-                    sum += sums[threadIdx.x][threadIdx.y - 1];
-
-                sum += stepSum;
-                stepSum += sums[threadIdx.x][blockDim.y - 1];
-
-                __syncthreads();
-
-                *p = sum;
-            }
-        #endif
-        }
-
-        void shfl_integral_gpu(const PtrStepSzb& img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
-        {
-            {
-                // each thread handles 16 values, use 1 block/row
-                // save, becouse step is actually can't be less 512 bytes
-                int block = integral.cols / 16;
-
-                // launch 1 block / row
-                const int grid = img.rows;
-
-                cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
-
-                shfl_integral_horizontal<<<grid, block, 0, stream>>>((const PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
-                cudaSafeCall( cudaGetLastError() );
-            }
-
-            {
-                const dim3 block(32, 8);
-                const dim3 grid(divUp(integral.cols, block.x), 1);
-
-                shfl_integral_vertical<<<grid, block, 0, stream>>>(integral);
-                cudaSafeCall( cudaGetLastError() );
-            }
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        __global__ void shfl_integral_vertical(PtrStepSz<unsigned int> buffer, PtrStepSz<unsigned int> integral)
-        {
-        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
-            __shared__ unsigned int sums[32][9];
-
-            const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
-            const int lane_id = tidx % 8;
-
-            if (tidx >= integral.cols)
-                return;
-
-            sums[threadIdx.x][threadIdx.y] = 0;
-            __syncthreads();
-
-            unsigned int stepSum = 0;
-
-            for (int y = threadIdx.y; y < integral.rows; y += blockDim.y)
-            {
-                unsigned int* p = buffer.ptr(y) + tidx;
-                unsigned int* dst = integral.ptr(y + 1) + tidx + 1;
-
-                unsigned int sum = *p;
-
-                sums[threadIdx.x][threadIdx.y] = sum;
-                __syncthreads();
-
-                // place into SMEM
-                // shfl scan reduce the SMEM, reformating so the column
-                // sums are computed in a warp
-                // then read out properly
-                const int j = threadIdx.x % 8;
-                const int k = threadIdx.x / 8 + threadIdx.y * 4;
-
-                int partial_sum = sums[k][j];
-
-                for (int i = 1; i <= 8; i *= 2)
-                {
-                    int n = __shfl_up(partial_sum, i, 32);
-
-                    if (lane_id >= i)
-                        partial_sum += n;
-                }
-
-                sums[k][j] = partial_sum;
-                __syncthreads();
-
-                if (threadIdx.y > 0)
-                    sum += sums[threadIdx.x][threadIdx.y - 1];
-
-                sum += stepSum;
-                stepSum += sums[threadIdx.x][blockDim.y - 1];
-
-                __syncthreads();
-
-                *dst = sum;
-            }
-        #endif
-        }
-
-        // used for frame preprocessing before Soft Cascade evaluation: no synchronization needed
-        void shfl_integral_gpu_buffered(PtrStepSzb img, PtrStepSz<uint4> buffer, PtrStepSz<unsigned int> integral,
-            int blockStep, cudaStream_t stream)
-        {
-            {
-                const int block = blockStep;
-                const int grid = img.rows;
-
-                cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
-
-                shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, buffer);
-                cudaSafeCall( cudaGetLastError() );
-            }
-
-            {
-                const dim3 block(32, 8);
-                const dim3 grid(divUp(integral.cols, block.x), 1);
-
-                shfl_integral_vertical<<<grid, block, 0, stream>>>((PtrStepSz<uint>)buffer, integral);
-                cudaSafeCall( cudaGetLastError() );
-            }
-        }
-    }
-}}}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/internal_shared.hpp
+++ b/modules/gpu/src/cuda/internal_shared.hpp
@@ -1,73 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_internal_shared_HPP__
-#define __OPENCV_internal_shared_HPP__
-
-#include <cuda_runtime.h>
-#include <npp.h>
-#include "NPP_staging.hpp"
-#include "opencv2/gpu/devmem2d.hpp"
-#include "safe_call.hpp"
-#include "opencv2/core/cuda/common.hpp"
-
-namespace cv { namespace gpu
-{
-    class NppStStreamHandler
-    {
-    public:
-        inline explicit NppStStreamHandler(cudaStream_t newStream = 0)
-        {
-            oldStream = nppStSetActiveCUDAstream(newStream);
-        }
-
-        inline ~NppStStreamHandler()
-        {
-            nppStSetActiveCUDAstream(oldStream);
-        }
-
-    private:
-        cudaStream_t oldStream;
-    };
-}}
-
-#endif /* __OPENCV_internal_shared_HPP__ */
--- a/modules/gpu/src/cuda/match_template.cu
+++ b/modules/gpu/src/cuda/match_template.cu
@@ -1,916 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace match_template
-    {
-        __device__ __forceinline__ float sum(float v) { return v; }
-        __device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
-        __device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }
-        __device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }
-
-        __device__ __forceinline__ float first(float v) { return v; }
-        __device__ __forceinline__ float first(float2 v) { return v.x; }
-        __device__ __forceinline__ float first(float3 v) { return v.x; }
-        __device__ __forceinline__ float first(float4 v) { return v.x; }
-
-        __device__ __forceinline__ float mul(float a, float b) { return a * b; }
-        __device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
-        __device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
-        __device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-
-        __device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }
-        __device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
-        __device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
-        __device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
-
-        __device__ __forceinline__ float sub(float a, float b) { return a - b; }
-        __device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
-        __device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
-        __device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-
-        __device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }
-        __device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
-        __device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
-        __device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
-
-        //////////////////////////////////////////////////////////////////////
-        // Naive_CCORR
-
-        template <typename T, int cn>
-        __global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
-        {
-            typedef typename TypeVec<T, cn>::vec_type Type;
-            typedef typename TypeVec<float, cn>::vec_type Typef;
-
-            int x = blockDim.x * blockIdx.x + threadIdx.x;
-            int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                Typef res = VecTraits<Typef>::all(0);
-
-                for (int i = 0; i < h; ++i)
-                {
-                    const Type* image_ptr = (const Type*)image.ptr(y + i);
-                    const Type* templ_ptr = (const Type*)templ.ptr(i);
-                    for (int j = 0; j < w; ++j)
-                        res = res + mul(image_ptr[x + j], templ_ptr[j]);
-                }
-
-                result.ptr(y)[x] = sum(res);
-            }
-        }
-
-        template <typename T, int cn>
-        void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
-        {
-            const dim3 threads(32, 8);
-            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
-
-            static const caller_t callers[] =
-            {
-                0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
-            };
-
-            callers[cn](image, templ, result, stream);
-        }
-
-
-        void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
-
-            static const caller_t callers[] =
-            {
-                0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
-            };
-
-            callers[cn](image, templ, result, stream);
-        }
-
-        //////////////////////////////////////////////////////////////////////
-        // Naive_SQDIFF
-
-        template <typename T, int cn>
-        __global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
-        {
-            typedef typename TypeVec<T, cn>::vec_type Type;
-            typedef typename TypeVec<float, cn>::vec_type Typef;
-
-            int x = blockDim.x * blockIdx.x + threadIdx.x;
-            int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                Typef res = VecTraits<Typef>::all(0);
-                Typef delta;
-
-                for (int i = 0; i < h; ++i)
-                {
-                    const Type* image_ptr = (const Type*)image.ptr(y + i);
-                    const Type* templ_ptr = (const Type*)templ.ptr(i);
-                    for (int j = 0; j < w; ++j)
-                    {
-                        delta = sub(image_ptr[x + j], templ_ptr[j]);
-                        res = res + delta * delta;
-                    }
-                }
-
-                result.ptr(y)[x] = sum(res);
-            }
-        }
-
-        template <typename T, int cn>
-        void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
-        {
-            const dim3 threads(32, 8);
-            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
-
-            static const caller_t callers[] =
-            {
-                0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
-            };
-
-            callers[cn](image, templ, result, stream);
-        }
-
-        void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
-
-            static const caller_t callers[] =
-            {
-                0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
-            };
-
-            callers[cn](image, templ, result, stream);
-        }
-
-        //////////////////////////////////////////////////////////////////////
-        // Prepared_SQDIFF
-
-        template <int cn>
-        __global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sqsum_ = (float)(
-                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
-                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
-                float ccorr = result.ptr(y)[x];
-                result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum;
-            }
-        }
-
-        template <int cn>
-        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream)
-        {
-            const dim3 threads(32, 8);
-            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn,
-                                             cudaStream_t stream)
-        {
-            typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
-
-            static const caller_t callers[] =
-            {
-                0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
-            };
-
-            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
-        }
-
-        //////////////////////////////////////////////////////////////////////
-        // Prepared_SQDIFF_NORMED
-
-        // normAcc* are accurate normalization routines which make GPU matchTemplate
-        // consistent with CPU one
-
-        __device__ float normAcc(float num, float denum)
-        {
-            if (::fabs(num) < denum)
-                return num / denum;
-            if (::fabs(num) < denum * 1.125f)
-                return num > 0 ? 1 : -1;
-            return 0;
-        }
-
-
-        __device__ float normAcc_SQDIFF(float num, float denum)
-        {
-            if (::fabs(num) < denum)
-                return num / denum;
-            if (::fabs(num) < denum * 1.125f)
-                return num > 0 ? 1 : -1;
-            return 1;
-        }
-
-
-        template <int cn>
-        __global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
-                int w, int h, const PtrStep<unsigned long long> image_sqsum,
-                unsigned long long templ_sqsum, PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sqsum_ = (float)(
-                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
-                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
-                float ccorr = result.ptr(y)[x];
-                result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,
-                                                  sqrtf(image_sqsum_ * templ_sqsum));
-            }
-        }
-
-        template <int cn>
-        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
-                                                    PtrStepSzf result, cudaStream_t stream)
-        {
-            const dim3 threads(32, 8);
-            const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-        void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
-                                                    PtrStepSzf result, int cn, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
-            static const caller_t callers[] =
-            {
-                0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
-            };
-
-            callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
-        }
-
-        //////////////////////////////////////////////////////////////////////
-        // Prepared_CCOFF
-
-        __global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sum_ = (float)(
-                        (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
-                        (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
-                float ccorr = result.ptr(y)[x];
-                result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;
-            }
-        }
-
-        void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-
-        __global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
-                int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
-                const PtrStep<unsigned int> image_sum_r,
-                const PtrStep<unsigned int> image_sum_g,
-                PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sum_r_ = (float)(
-                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
-                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
-                float image_sum_g_ = (float)(
-                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
-                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
-                float ccorr = result.ptr(y)[x];
-                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
-                                         - image_sum_g_ * templ_sum_scale_g;
-            }
-        }
-
-        void matchTemplatePrepared_CCOFF_8UC2(
-                int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r,
-                const PtrStepSz<unsigned int> image_sum_g,
-                unsigned int templ_sum_r, unsigned int templ_sum_g,
-                PtrStepSzf result, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
-                    w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
-                    image_sum_r, image_sum_g, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-
-        __global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
-                int w, int h,
-                float templ_sum_scale_r,
-                float templ_sum_scale_g,
-                float templ_sum_scale_b,
-                const PtrStep<unsigned int> image_sum_r,
-                const PtrStep<unsigned int> image_sum_g,
-                const PtrStep<unsigned int> image_sum_b,
-                PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sum_r_ = (float)(
-                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
-                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
-                float image_sum_g_ = (float)(
-                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
-                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
-                float image_sum_b_ = (float)(
-                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
-                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
-                float ccorr = result.ptr(y)[x];
-                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
-                                         - image_sum_g_ * templ_sum_scale_g
-                                         - image_sum_b_ * templ_sum_scale_b;
-            }
-        }
-
-        void matchTemplatePrepared_CCOFF_8UC3(
-                int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r,
-                const PtrStepSz<unsigned int> image_sum_g,
-                const PtrStepSz<unsigned int> image_sum_b,
-                unsigned int templ_sum_r,
-                unsigned int templ_sum_g,
-                unsigned int templ_sum_b,
-                PtrStepSzf result, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
-                    w, h,
-                    (float)templ_sum_r / (w * h),
-                    (float)templ_sum_g / (w * h),
-                    (float)templ_sum_b / (w * h),
-                    image_sum_r, image_sum_g, image_sum_b, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-
-        __global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
-                int w, int h,
-                float templ_sum_scale_r,
-                float templ_sum_scale_g,
-                float templ_sum_scale_b,
-                float templ_sum_scale_a,
-                const PtrStep<unsigned int> image_sum_r,
-                const PtrStep<unsigned int> image_sum_g,
-                const PtrStep<unsigned int> image_sum_b,
-                const PtrStep<unsigned int> image_sum_a,
-                PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sum_r_ = (float)(
-                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
-                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
-                float image_sum_g_ = (float)(
-                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
-                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
-                float image_sum_b_ = (float)(
-                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
-                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
-                float image_sum_a_ = (float)(
-                        (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
-                        (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
-                float ccorr = result.ptr(y)[x];
-                result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
-                                         - image_sum_g_ * templ_sum_scale_g
-                                         - image_sum_b_ * templ_sum_scale_b
-                                         - image_sum_a_ * templ_sum_scale_a;
-            }
-        }
-
-        void matchTemplatePrepared_CCOFF_8UC4(
-                int w, int h,
-                const PtrStepSz<unsigned int> image_sum_r,
-                const PtrStepSz<unsigned int> image_sum_g,
-                const PtrStepSz<unsigned int> image_sum_b,
-                const PtrStepSz<unsigned int> image_sum_a,
-                unsigned int templ_sum_r,
-                unsigned int templ_sum_g,
-                unsigned int templ_sum_b,
-                unsigned int templ_sum_a,
-                PtrStepSzf result, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
-                    w, h,
-                    (float)templ_sum_r / (w * h),
-                    (float)templ_sum_g / (w * h),
-                    (float)templ_sum_b / (w * h),
-                    (float)templ_sum_a / (w * h),
-                    image_sum_r, image_sum_g, image_sum_b, image_sum_a,
-                    result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        //////////////////////////////////////////////////////////////////////
-        // Prepared_CCOFF_NORMED
-
-        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
-                int w, int h, float weight,
-                float templ_sum_scale, float templ_sqsum_scale,
-                const PtrStep<unsigned int> image_sum,
-                const PtrStep<unsigned long long> image_sqsum,
-                PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float ccorr = result.ptr(y)[x];
-                float image_sum_ = (float)(
-                        (image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
-                        (image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
-                float image_sqsum_ = (float)(
-                        (image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
-                        (image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
-                result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,
-                                           sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));
-            }
-        }
-
-        void matchTemplatePrepared_CCOFF_NORMED_8U(
-                    int w, int h, const PtrStepSz<unsigned int> image_sum,
-                    const PtrStepSz<unsigned long long> image_sqsum,
-                    unsigned int templ_sum, unsigned long long templ_sqsum,
-                    PtrStepSzf result, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            float weight = 1.f / (w * h);
-            float templ_sum_scale = templ_sum * weight;
-            float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;
-
-            matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
-                    w, h, weight, templ_sum_scale, templ_sqsum_scale,
-                    image_sum, image_sqsum, result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-
-        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
-                int w, int h, float weight,
-                float templ_sum_scale_r, float templ_sum_scale_g,
-                float templ_sqsum_scale,
-                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
-                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
-                PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sum_r_ = (float)(
-                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
-                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
-                float image_sqsum_r_ = (float)(
-                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
-                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
-                float image_sum_g_ = (float)(
-                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
-                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
-                float image_sqsum_g_ = (float)(
-                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
-                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
-
-                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
-                                             - image_sum_g_ * templ_sum_scale_g;
-                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
-                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));
-                result.ptr(y)[x] = normAcc(num, denum);
-            }
-        }
-
-        void matchTemplatePrepared_CCOFF_NORMED_8UC2(
-                    int w, int h,
-                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
-                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
-                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
-                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
-                    PtrStepSzf result, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            float weight = 1.f / (w * h);
-            float templ_sum_scale_r = templ_sum_r * weight;
-            float templ_sum_scale_g = templ_sum_g * weight;
-            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
-                                       + templ_sqsum_g - weight * templ_sum_g * templ_sum_g;
-
-            matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
-                    w, h, weight,
-                    templ_sum_scale_r, templ_sum_scale_g,
-                    templ_sqsum_scale,
-                    image_sum_r, image_sqsum_r,
-                    image_sum_g, image_sqsum_g,
-                    result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-
-        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
-                int w, int h, float weight,
-                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
-                float templ_sqsum_scale,
-                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
-                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
-                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
-                PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sum_r_ = (float)(
-                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
-                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
-                float image_sqsum_r_ = (float)(
-                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
-                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
-                float image_sum_g_ = (float)(
-                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
-                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
-                float image_sqsum_g_ = (float)(
-                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
-                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
-                float image_sum_b_ = (float)(
-                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
-                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
-                float image_sqsum_b_ = (float)(
-                        (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
-                        (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
-
-                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
-                                             - image_sum_g_ * templ_sum_scale_g
-                                             - image_sum_b_ * templ_sum_scale_b;
-                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
-                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
-                                                         + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));
-                result.ptr(y)[x] = normAcc(num, denum);
-            }
-        }
-
-        void matchTemplatePrepared_CCOFF_NORMED_8UC3(
-                    int w, int h,
-                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
-                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
-                    const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
-                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
-                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
-                    unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
-                    PtrStepSzf result, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            float weight = 1.f / (w * h);
-            float templ_sum_scale_r = templ_sum_r * weight;
-            float templ_sum_scale_g = templ_sum_g * weight;
-            float templ_sum_scale_b = templ_sum_b * weight;
-            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
-                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
-                                      + templ_sqsum_b - weight * templ_sum_b * templ_sum_b;
-
-            matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
-                    w, h, weight,
-                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
-                    templ_sqsum_scale,
-                    image_sum_r, image_sqsum_r,
-                    image_sum_g, image_sqsum_g,
-                    image_sum_b, image_sqsum_b,
-                    result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-
-        __global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
-                int w, int h, float weight,
-                float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
-                float templ_sum_scale_a, float templ_sqsum_scale,
-                const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
-                const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
-                const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
-                const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,
-                PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sum_r_ = (float)(
-                        (image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
-                        (image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
-                float image_sqsum_r_ = (float)(
-                        (image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
-                        (image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
-                float image_sum_g_ = (float)(
-                        (image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
-                        (image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
-                float image_sqsum_g_ = (float)(
-                        (image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
-                        (image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
-                float image_sum_b_ = (float)(
-                        (image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
-                        (image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
-                float image_sqsum_b_ = (float)(
-                        (image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
-                        (image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
-                float image_sum_a_ = (float)(
-                        (image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
-                        (image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
-                float image_sqsum_a_ = (float)(
-                        (image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) -
-                        (image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x]));
-
-                float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g
-                                             - image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a;
-                float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
-                                                         + image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
-                                                         + image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_
-                                                         + image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));
-                result.ptr(y)[x] = normAcc(num, denum);
-            }
-        }
-
-        void matchTemplatePrepared_CCOFF_NORMED_8UC4(
-                    int w, int h,
-                    const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
-                    const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
-                    const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
-                    const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
-                    unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
-                    unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
-                    unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
-                    unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
-                    PtrStepSzf result, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            float weight = 1.f / (w * h);
-            float templ_sum_scale_r = templ_sum_r * weight;
-            float templ_sum_scale_g = templ_sum_g * weight;
-            float templ_sum_scale_b = templ_sum_b * weight;
-            float templ_sum_scale_a = templ_sum_a * weight;
-            float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
-                                      + templ_sqsum_g - weight * templ_sum_g * templ_sum_g
-                                      + templ_sqsum_b - weight * templ_sum_b * templ_sum_b
-                                      + templ_sqsum_a - weight * templ_sum_a * templ_sum_a;
-
-            matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
-                    w, h, weight,
-                    templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
-                    templ_sqsum_scale,
-                    image_sum_r, image_sqsum_r,
-                    image_sum_g, image_sqsum_g,
-                    image_sum_b, image_sqsum_b,
-                    image_sum_a, image_sqsum_a,
-                    result);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        //////////////////////////////////////////////////////////////////////
-        // normalize
-
-        template <int cn>
-        __global__ void normalizeKernel_8U(
-                int w, int h, const PtrStep<unsigned long long> image_sqsum,
-                unsigned long long templ_sqsum, PtrStepSzf result)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                float image_sqsum_ = (float)(
-                        (image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
-                        (image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
-                result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));
-            }
-        }
-
-        void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
-                          unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            switch (cn)
-            {
-            case 1:
-                normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
-                break;
-            case 2:
-                normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
-                break;
-            case 3:
-                normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
-                break;
-            case 4:
-                normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
-                break;
-            }
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        //////////////////////////////////////////////////////////////////////
-        // extractFirstChannel
-
-        template <int cn>
-        __global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result)
-        {
-            typedef typename TypeVec<float, cn>::vec_type Typef;
-
-            int x = blockDim.x * blockIdx.x + threadIdx.x;
-            int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < result.cols && y < result.rows)
-            {
-                Typef val = ((const Typef*)image.ptr(y))[x];
-                result.ptr(y)[x] = first(val);
-            }
-        }
-
-        void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream)
-        {
-            dim3 threads(32, 8);
-            dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
-
-            switch (cn)
-            {
-            case 1:
-                extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
-                break;
-            case 2:
-                extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
-                break;
-            case 3:
-                extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
-                break;
-            case 4:
-                extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
-                break;
-            }
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    } //namespace match_template
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -1,217 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace mathfunc
-    {
-        //////////////////////////////////////////////////////////////////////////////////////
-        // Cart <-> Polar
-
-        struct Nothing
-        {
-            static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
-            {
-            }
-        };
-        struct Magnitude
-        {
-            static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
-            {
-                dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
-            }
-        };
-        struct MagnitudeSqr
-        {
-            static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
-            {
-                dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
-            }
-        };
-        struct Atan2
-        {
-            static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
-            {
-                float angle = ::atan2f(y_data, x_data);
-                angle += (angle < 0) * 2.0f * CV_PI_F;
-                dst[y * dst_step + x] = scale * angle;
-            }
-        };
-        template <typename Mag, typename Angle>
-        __global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
-                                    float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
-        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < width && y < height)
-            {
-                float x_data = xptr[y * x_step + x];
-                float y_data = yptr[y * y_step + x];
-
-                Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
-                Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
-            }
-        }
-
-        struct NonEmptyMag
-        {
-            static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
-            {
-                return mag[y * mag_step + x];
-            }
-        };
-        struct EmptyMag
-        {
-            static __device__ __forceinline__ float get(const float*, size_t, int, int)
-            {
-                return 1.0f;
-            }
-        };
-        template <typename Mag>
-        __global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
-            float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
-        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < width && y < height)
-            {
-                float mag_data = Mag::get(mag, mag_step, x, y);
-                float angle_data = angle[y * angle_step + x];
-                float sin_a, cos_a;
-
-                ::sincosf(scale * angle_data, &sin_a, &cos_a);
-
-                xptr[y * x_step + x] = mag_data * cos_a;
-                yptr[y * y_step + x] = mag_data * sin_a;
-            }
-        }
-
-        template <typename Mag, typename Angle>
-        void cartToPolar_caller(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(x.cols, threads.x);
-            grid.y = divUp(x.rows, threads.y);
-
-            const float scale = angleInDegrees ? (180.0f / CV_PI_F) : 1.f;
-
-            cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
-                x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
-                mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
-            static const caller_t callers[2][2][2] =
-            {
-                {
-                    {
-                        cartToPolar_caller<Magnitude, Atan2>,
-                        cartToPolar_caller<Magnitude, Nothing>
-                    },
-                    {
-                        cartToPolar_caller<MagnitudeSqr, Atan2>,
-                        cartToPolar_caller<MagnitudeSqr, Nothing>,
-                    }
-                },
-                {
-                    {
-                        cartToPolar_caller<Nothing, Atan2>,
-                        cartToPolar_caller<Nothing, Nothing>
-                    },
-                    {
-                        cartToPolar_caller<Nothing, Atan2>,
-                        cartToPolar_caller<Nothing, Nothing>,
-                    }
-                }
-            };
-
-            callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
-        }
-
-        template <typename Mag>
-        void polarToCart_caller(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(mag.cols, threads.x);
-            grid.y = divUp(mag.rows, threads.y);
-
-            const float scale = angleInDegrees ? (CV_PI_F / 180.0f) : 1.0f;
-
-            polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
-                angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
-            static const caller_t callers[2] =
-            {
-                polarToCart_caller<NonEmptyMag>,
-                polarToCart_caller<EmptyMag>
-            };
-
-            callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
-        }
-    } // namespace mathfunc
-}}} // namespace cv { namespace gpu { namespace cudev
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
--- a/modules/gpu/src/cuda/nlm.cu
+++ b/modules/gpu/src/cuda/nlm.cu
@@ -1,569 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-
-using namespace cv::gpu;
-
-typedef unsigned char uchar;
-typedef unsigned short ushort;
-
-//////////////////////////////////////////////////////////////////////////////////
-//// Non Local Means Denosing
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        __device__ __forceinline__ float norm2(const float& v) { return v*v; }
-        __device__ __forceinline__ float norm2(const float2& v) { return v.x*v.x + v.y*v.y; }
-        __device__ __forceinline__ float norm2(const float3& v) { return v.x*v.x + v.y*v.y + v.z*v.z; }
-        __device__ __forceinline__ float norm2(const float4& v) { return v.x*v.x + v.y*v.y + v.z*v.z  + v.w*v.w; }
-
-        template<typename T, typename B>
-        __global__ void nlm_kernel(const PtrStep<T> src, PtrStepSz<T> dst, const B b, int search_radius, int block_radius, float noise_mult)
-        {
-            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
-
-            const int i = blockDim.y * blockIdx.y + threadIdx.y;
-            const int j = blockDim.x * blockIdx.x + threadIdx.x;
-
-            if (j >= dst.cols || i >= dst.rows)
-                return;
-
-            int bsize = search_radius + block_radius;
-            int search_window = 2 * search_radius + 1;
-            float minus_search_window2_inv = -1.f/(search_window * search_window);
-
-            value_type sum1 = VecTraits<value_type>::all(0);
-            float sum2 = 0.f;
-
-            if (j - bsize >= 0 && j + bsize < dst.cols && i - bsize >= 0 && i + bsize < dst.rows)
-            {
-                for(float y = -search_radius; y <= search_radius; ++y)
-                    for(float x = -search_radius; x <= search_radius; ++x)
-                    {
-                        float dist2 = 0;
-                        for(float ty = -block_radius; ty <= block_radius; ++ty)
-                            for(float tx = -block_radius; tx <= block_radius; ++tx)
-                            {
-                                value_type bv = saturate_cast<value_type>(src(i + y + ty, j + x + tx));
-                                value_type av = saturate_cast<value_type>(src(i +     ty, j +     tx));
-
-                                dist2 += norm2(av - bv);
-                            }
-
-                        float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv);
-
-                        /*if (i == 255 && j == 255)
-                            printf("%f %f\n", w, dist2 * minus_h2_inv + (x * x + y * y) * minus_search_window2_inv);*/
-
-                        sum1 = sum1 + w * saturate_cast<value_type>(src(i + y, j + x));
-                        sum2 += w;
-                    }
-            }
-            else
-            {
-                for(float y = -search_radius; y <= search_radius; ++y)
-                    for(float x = -search_radius; x <= search_radius; ++x)
-                    {
-                        float dist2 = 0;
-                        for(float ty = -block_radius; ty <= block_radius; ++ty)
-                            for(float tx = -block_radius; tx <= block_radius; ++tx)
-                            {
-                                value_type bv = saturate_cast<value_type>(b.at(i + y + ty, j + x + tx, src));
-                                value_type av = saturate_cast<value_type>(b.at(i +     ty, j +     tx, src));
-                                dist2 += norm2(av - bv);
-                            }
-
-                        float w = __expf(dist2 * noise_mult + (x * x + y * y) * minus_search_window2_inv);
-
-                        sum1 = sum1 + w * saturate_cast<value_type>(b.at(i + y, j + x, src));
-                        sum2 += w;
-                    }
-
-            }
-
-            dst(i, j) = saturate_cast<T>(sum1 / sum2);
-
-        }
-
-        template<typename T, template <typename> class B>
-        void nlm_caller(const PtrStepSzb src, PtrStepSzb dst, int search_radius, int block_radius, float h, cudaStream_t stream)
-        {
-            dim3 block (32, 8);
-            dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y));
-
-            B<T> b(src.rows, src.cols);
-
-            int block_window = 2 * block_radius + 1;
-            float minus_h2_inv = -1.f/(h * h * VecTraits<T>::cn);
-            float noise_mult = minus_h2_inv/(block_window * block_window);
-
-            cudaSafeCall( cudaFuncSetCacheConfig (nlm_kernel<T, B<T> >, cudaFuncCachePreferL1) );
-            nlm_kernel<<<grid, block>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, search_radius, block_radius, noise_mult);
-            cudaSafeCall ( cudaGetLastError () );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template<typename T>
-        void nlm_bruteforce_gpu(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream)
-        {
-            typedef void (*func_t)(const PtrStepSzb src, PtrStepSzb dst, int search_radius, int block_radius, float h, cudaStream_t stream);
-
-            static func_t funcs[] =
-            {
-                nlm_caller<T, BrdReflect101>,
-                nlm_caller<T, BrdReplicate>,
-                nlm_caller<T, BrdConstant>,
-                nlm_caller<T, BrdReflect>,
-                nlm_caller<T, BrdWrap>,
-            };
-            funcs[borderMode](src, dst, search_radius, block_radius, h, stream);
-        }
-
-        template void nlm_bruteforce_gpu<uchar>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
-        template void nlm_bruteforce_gpu<uchar2>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
-        template void nlm_bruteforce_gpu<uchar3>(const PtrStepSzb&, PtrStepSzb, int, int, float, int, cudaStream_t);
-    }
-}}}
-
-//////////////////////////////////////////////////////////////////////////////////
-//// Non Local Means Denosing (fast approximate version)
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-
-        template <int cn> struct Unroll;
-        template <> struct Unroll<1>
-        {
-            template <int BLOCK_SIZE>
-            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*> smem_tuple(float* smem)
-            {
-                return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE);
-            }
-
-            static __device__ __forceinline__ thrust::tuple<float&, float&> tie(float& val1, float& val2)
-            {
-                return thrust::tie(val1, val2);
-            }
-
-            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float> > op()
-            {
-                plus<float> op;
-                return thrust::make_tuple(op, op);
-            }
-        };
-        template <> struct Unroll<2>
-        {
-            template <int BLOCK_SIZE>
-            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
-            {
-                return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE);
-            }
-
-            static __device__ __forceinline__ thrust::tuple<float&, float&, float&> tie(float& val1, float2& val2)
-            {
-                return thrust::tie(val1, val2.x, val2.y);
-            }
-
-            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float> > op()
-            {
-                plus<float> op;
-                return thrust::make_tuple(op, op, op);
-            }
-        };
-        template <> struct Unroll<3>
-        {
-            template <int BLOCK_SIZE>
-            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
-            {
-                return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE);
-            }
-
-            static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&> tie(float& val1, float3& val2)
-            {
-                return thrust::tie(val1, val2.x, val2.y, val2.z);
-            }
-
-            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float> > op()
-            {
-                plus<float> op;
-                return thrust::make_tuple(op, op, op, op);
-            }
-        };
-        template <> struct Unroll<4>
-        {
-            template <int BLOCK_SIZE>
-            static __device__ __forceinline__ thrust::tuple<volatile float*, volatile float*, volatile float*, volatile float*, volatile float*> smem_tuple(float* smem)
-            {
-                return cv::gpu::cudev::smem_tuple(smem, smem + BLOCK_SIZE, smem + 2 * BLOCK_SIZE, smem + 3 * BLOCK_SIZE, smem + 4 * BLOCK_SIZE);
-            }
-
-            static __device__ __forceinline__ thrust::tuple<float&, float&, float&, float&, float&> tie(float& val1, float4& val2)
-            {
-                return thrust::tie(val1, val2.x, val2.y, val2.z, val2.w);
-            }
-
-            static __device__ __forceinline__ const thrust::tuple<plus<float>, plus<float>, plus<float>, plus<float>, plus<float> > op()
-            {
-                plus<float> op;
-                return thrust::make_tuple(op, op, op, op, op);
-            }
-        };
-
-        __device__ __forceinline__ int calcDist(const uchar&  a, const uchar&  b) { return (a-b)*(a-b); }
-        __device__ __forceinline__ int calcDist(const uchar2& a, const uchar2& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y); }
-        __device__ __forceinline__ int calcDist(const uchar3& a, const uchar3& b) { return (a.x-b.x)*(a.x-b.x) + (a.y-b.y)*(a.y-b.y) + (a.z-b.z)*(a.z-b.z); }
-
-        template <class T> struct FastNonLocalMenas
-        {
-            enum
-            {
-                CTA_SIZE = 128,
-
-                TILE_COLS = 128,
-                TILE_ROWS = 32,
-
-                STRIDE = CTA_SIZE
-            };
-
-            struct plus
-            {
-                __device__ __forceinline__ float operator()(float v1, float v2) const { return v1 + v2; }
-            };
-
-            int search_radius;
-            int block_radius;
-
-            int search_window;
-            int block_window;
-            float minus_h2_inv;
-
-            FastNonLocalMenas(int search_window_, int block_window_, float h) : search_radius(search_window_/2), block_radius(block_window_/2),
-                search_window(search_window_), block_window(block_window_), minus_h2_inv(-1.f/(h * h * VecTraits<T>::cn)) {}
-
-            PtrStep<T> src;
-            mutable PtrStepi buffer;
-
-            __device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
-            {
-                for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
-                {
-                    dist_sums[index] = 0;
-
-                    for(int tx = 0; tx < block_window; ++tx)
-                        col_sums(tx, index) = 0;
-
-                    int y = index / search_window;
-                    int x = index - y * search_window;
-
-                    int ay = i;
-                    int ax = j;
-
-                    int by = i + y - search_radius;
-                    int bx = j + x - search_radius;
-
-#if 1
-                    for (int tx = -block_radius; tx <= block_radius; ++tx)
-                    {
-                        int col_sum = 0;
-                        for (int ty = -block_radius; ty <= block_radius; ++ty)
-                        {
-                            int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx));
-
-                            dist_sums[index] += dist;
-                            col_sum += dist;
-                        }
-                        col_sums(tx + block_radius, index) = col_sum;
-                    }
-#else
-                    for (int ty = -block_radius; ty <= block_radius; ++ty)
-                        for (int tx = -block_radius; tx <= block_radius; ++tx)
-                        {
-                            int dist = calcDist(src(ay + ty, ax + tx), src(by + ty, bx + tx));
-
-                            dist_sums[index] += dist;
-                            col_sums(tx + block_radius, index) += dist;
-                        }
-#endif
-
-                    up_col_sums(j, index) = col_sums(block_window - 1, index);
-                }
-            }
-
-            __device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
-            {
-                for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
-                {
-                    int y = index / search_window;
-                    int x = index - y * search_window;
-
-                    int ay = i;
-                    int ax = j + block_radius;
-
-                    int by = i + y - search_radius;
-                    int bx = j + x - search_radius + block_radius;
-
-                    int col_sum = 0;
-
-                    for (int ty = -block_radius; ty <= block_radius; ++ty)
-                        col_sum += calcDist(src(ay + ty, ax), src(by + ty, bx));
-
-                    dist_sums[index] += col_sum - col_sums(first, index);
-
-                    col_sums(first, index) = col_sum;
-                    up_col_sums(j, index) = col_sum;
-                }
-            }
-
-            __device__ __forceinline__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
-            {
-                int ay = i;
-                int ax = j + block_radius;
-
-                T a_up   = src(ay - block_radius - 1, ax);
-                T a_down = src(ay + block_radius, ax);
-
-                for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
-                {
-                    int y = index / search_window;
-                    int x = index - y * search_window;
-
-                    int by = i + y - search_radius;
-                    int bx = j + x - search_radius + block_radius;
-
-                    T b_up   = src(by - block_radius - 1, bx);
-                    T b_down = src(by + block_radius, bx);
-
-                    int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up);
-
-                    dist_sums[index] += col_sum  - col_sums(first, index);
-                    col_sums(first, index) = col_sum;
-                    up_col_sums(j, index) = col_sum;
-                }
-            }
-
-            __device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums, T& dst) const
-            {
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_type;
-
-                float weights_sum = 0;
-                sum_type sum = VecTraits<sum_type>::all(0);
-
-                float bw2_inv = 1.f/(block_window * block_window);
-
-                int sx = j - search_radius;
-                int sy = i - search_radius;
-
-                for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
-                {
-                    int y = index / search_window;
-                    int x = index - y * search_window;
-
-                    float avg_dist = dist_sums[index] * bw2_inv;
-                    float weight = __expf(avg_dist * minus_h2_inv);
-                    weights_sum += weight;
-
-                    sum = sum + weight * saturate_cast<sum_type>(src(sy + y, sx + x));
-                }
-
-                __shared__ float cta_buffer[CTA_SIZE * (VecTraits<T>::cn + 1)];
-
-                reduce<CTA_SIZE>(Unroll<VecTraits<T>::cn>::template smem_tuple<CTA_SIZE>(cta_buffer),
-                                 Unroll<VecTraits<T>::cn>::tie(weights_sum, sum),
-                                 threadIdx.x,
-                                 Unroll<VecTraits<T>::cn>::op());
-
-                if (threadIdx.x == 0)
-                    dst = saturate_cast<T>(sum / weights_sum);
-            }
-
-            __device__ __forceinline__ void operator()(PtrStepSz<T>& dst) const
-            {
-                int tbx = blockIdx.x * TILE_COLS;
-                int tby = blockIdx.y * TILE_ROWS;
-
-                int tex = ::min(tbx + TILE_COLS, dst.cols);
-                int tey = ::min(tby + TILE_ROWS, dst.rows);
-
-                PtrStepi col_sums;
-                col_sums.data = buffer.ptr(dst.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window;
-                col_sums.step = buffer.step;
-
-                PtrStepi up_col_sums;
-                up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window;
-                up_col_sums.step = buffer.step;
-
-                extern __shared__ int dist_sums[]; //search_window * search_window
-
-                int first = 0;
-
-                for (int i = tby; i < tey; ++i)
-                    for (int j = tbx; j < tex; ++j)
-                    {
-                        __syncthreads();
-
-                        if (j == tbx)
-                        {
-                            initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums);
-                            first = 0;
-                        }
-                        else
-                        {
-                            if (i == tby)
-                              shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums);
-                            else
-                              shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums);
-
-                            first = (first + 1) % block_window;
-                        }
-
-                        __syncthreads();
-
-                        convolve_window(i, j, dist_sums, col_sums, up_col_sums, dst(i, j));
-                    }
-            }
-
-        };
-
-        template<typename T>
-        __global__ void fast_nlm_kernel(const FastNonLocalMenas<T> fnlm, PtrStepSz<T> dst) { fnlm(dst); }
-
-        void nln_fast_get_buffer_size(const PtrStepSzb& src, int search_window, int block_window, int& buffer_cols, int& buffer_rows)
-        {
-            typedef FastNonLocalMenas<uchar> FNLM;
-            dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS));
-
-            buffer_cols = search_window * search_window * grid.y;
-            buffer_rows = src.cols + block_window * grid.x;
-        }
-
-        template<typename T>
-        void nlm_fast_gpu(const PtrStepSzb& src, PtrStepSzb dst, PtrStepi buffer,
-                          int search_window, int block_window, float h, cudaStream_t stream)
-        {
-            typedef FastNonLocalMenas<T> FNLM;
-            FNLM fnlm(search_window, block_window, h);
-
-            fnlm.src = (PtrStepSz<T>)src;
-            fnlm.buffer = buffer;
-
-            dim3 block(FNLM::CTA_SIZE, 1);
-            dim3 grid(divUp(src.cols, FNLM::TILE_COLS), divUp(src.rows, FNLM::TILE_ROWS));
-            int smem = search_window * search_window * sizeof(int);
-
-
-            fast_nlm_kernel<<<grid, block, smem>>>(fnlm, (PtrStepSz<T>)dst);
-            cudaSafeCall ( cudaGetLastError () );
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void nlm_fast_gpu<uchar>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float,  cudaStream_t);
-        template void nlm_fast_gpu<uchar2>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
-        template void nlm_fast_gpu<uchar3>(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
-
-
-
-        __global__ void fnlm_split_kernel(const PtrStepSz<uchar3> lab, PtrStepb l, PtrStep<uchar2> ab)
-        {
-            int x = threadIdx.x + blockIdx.x * blockDim.x;
-            int y = threadIdx.y + blockIdx.y * blockDim.y;
-
-            if (x < lab.cols && y < lab.rows)
-            {
-                uchar3 p = lab(y, x);
-                ab(y,x) = make_uchar2(p.y, p.z);
-                l(y,x) = p.x;
-            }
-        }
-
-        void fnlm_split_channels(const PtrStepSz<uchar3>& lab, PtrStepb l, PtrStep<uchar2> ab, cudaStream_t stream)
-        {
-            dim3 b(32, 8);
-            dim3 g(divUp(lab.cols, b.x), divUp(lab.rows, b.y));
-
-            fnlm_split_kernel<<<g, b>>>(lab, l, ab);
-            cudaSafeCall ( cudaGetLastError () );
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        __global__ void fnlm_merge_kernel(const PtrStepb l, const PtrStep<uchar2> ab, PtrStepSz<uchar3> lab)
-        {
-            int x = threadIdx.x + blockIdx.x * blockDim.x;
-            int y = threadIdx.y + blockIdx.y * blockDim.y;
-
-            if (x < lab.cols && y < lab.rows)
-            {
-                uchar2 p = ab(y, x);
-                lab(y, x) = make_uchar3(l(y, x), p.x, p.y);
-            }
-        }
-
-        void fnlm_merge_channels(const PtrStepb& l, const PtrStep<uchar2>& ab, PtrStepSz<uchar3> lab, cudaStream_t stream)
-        {
-            dim3 b(32, 8);
-            dim3 g(divUp(lab.cols, b.x), divUp(lab.rows, b.y));
-
-            fnlm_merge_kernel<<<g, b>>>(l, ab, lab);
-            cudaSafeCall ( cudaGetLastError () );
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    }
-}}}
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/optflowbm.cu
+++ b/modules/gpu/src/cuda/optflowbm.cu
@@ -1,414 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::cudev;
-
-namespace optflowbm
-{
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_prev(false, cudaFilterModePoint, cudaAddressModeClamp);
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_curr(false, cudaFilterModePoint, cudaAddressModeClamp);
-
-    __device__ int cmpBlocks(int X1, int Y1, int X2, int Y2, int2 blockSize)
-    {
-        int s = 0;
-
-        for (int y = 0; y < blockSize.y; ++y)
-        {
-            for (int x = 0; x < blockSize.x; ++x)
-                s += ::abs(tex2D(tex_prev, X1 + x, Y1 + y) - tex2D(tex_curr, X2 + x, Y2 + y));
-        }
-
-        return s;
-    }
-
-    __global__ void calcOptFlowBM(PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious,
-                                  const int maxX, const int maxY, const int acceptLevel, const int escapeLevel,
-                                  const short2* ss, const int ssCount)
-    {
-        const int j = blockIdx.x * blockDim.x + threadIdx.x;
-        const int i = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (i >= velx.rows || j >= velx.cols)
-            return;
-
-        const int X1 = j * shiftSize.x;
-        const int Y1 = i * shiftSize.y;
-
-        const int offX = usePrevious ? __float2int_rn(velx(i, j)) : 0;
-        const int offY = usePrevious ? __float2int_rn(vely(i, j)) : 0;
-
-        int X2 = X1 + offX;
-        int Y2 = Y1 + offY;
-
-        int dist = numeric_limits<int>::max();
-
-        if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
-            dist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
-
-        int countMin = 1;
-        int sumx = offX;
-        int sumy = offY;
-
-        if (dist > acceptLevel)
-        {
-            // do brute-force search
-            for (int k = 0; k < ssCount; ++k)
-            {
-                const short2 ssVal = ss[k];
-
-                const int dx = offX + ssVal.x;
-                const int dy = offY + ssVal.y;
-
-                X2 = X1 + dx;
-                Y2 = Y1 + dy;
-
-                if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
-                {
-                    const int tmpDist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
-                    if (tmpDist < acceptLevel)
-                    {
-                        sumx = dx;
-                        sumy = dy;
-                        countMin = 1;
-                        break;
-                    }
-
-                    if (tmpDist < dist)
-                    {
-                        dist = tmpDist;
-                        sumx = dx;
-                        sumy = dy;
-                        countMin = 1;
-                    }
-                    else if (tmpDist == dist)
-                    {
-                        sumx += dx;
-                        sumy += dy;
-                        countMin++;
-                    }
-                }
-            }
-
-            if (dist > escapeLevel)
-            {
-                sumx = offX;
-                sumy = offY;
-                countMin = 1;
-            }
-        }
-
-        velx(i, j) = static_cast<float>(sumx) / countMin;
-        vely(i, j) = static_cast<float>(sumy) / countMin;
-    }
-
-    void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious,
-              int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream)
-    {
-        bindTexture(&tex_prev, prev);
-        bindTexture(&tex_curr, curr);
-
-        const dim3 block(32, 8);
-        const dim3 grid(divUp(velx.cols, block.x), divUp(vely.rows, block.y));
-
-        calcOptFlowBM<<<grid, block, 0, stream>>>(velx, vely, blockSize, shiftSize, usePrevious,
-                                                  maxX, maxY, acceptLevel,  escapeLevel, ss, ssCount);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-/////////////////////////////////////////////////////////
-// Fast approximate version
-
-namespace optflowbm_fast
-{
-    enum
-    {
-        CTA_SIZE = 128,
-
-        TILE_COLS = 128,
-        TILE_ROWS = 32,
-
-        STRIDE = CTA_SIZE
-    };
-
-    template <typename T> __device__ __forceinline__ int calcDist(T a, T b)
-    {
-        return ::abs(a - b);
-    }
-
-    template <class T> struct FastOptFlowBM
-    {
-
-        int search_radius;
-        int block_radius;
-
-        int search_window;
-        int block_window;
-
-        PtrStepSz<T> I0;
-        PtrStep<T> I1;
-
-        mutable PtrStepi buffer;
-
-        FastOptFlowBM(int search_window_, int block_window_,
-                      PtrStepSz<T> I0_, PtrStepSz<T> I1_,
-                      PtrStepi buffer_) :
-            search_radius(search_window_ / 2), block_radius(block_window_ / 2),
-            search_window(search_window_), block_window(block_window_),
-            I0(I0_), I1(I1_),
-            buffer(buffer_)
-        {
-        }
-
-        __device__ __forceinline__ void initSums_BruteForce(int i, int j, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
-        {
-            for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
-            {
-                dist_sums[index] = 0;
-
-                for (int tx = 0; tx < block_window; ++tx)
-                    col_sums(tx, index) = 0;
-
-                int y = index / search_window;
-                int x = index - y * search_window;
-
-                int ay = i;
-                int ax = j;
-
-                int by = i + y - search_radius;
-                int bx = j + x - search_radius;
-
-                for (int tx = -block_radius; tx <= block_radius; ++tx)
-                {
-                    int col_sum = 0;
-                    for (int ty = -block_radius; ty <= block_radius; ++ty)
-                    {
-                        int dist = calcDist(I0(ay + ty, ax + tx), I1(by + ty, bx + tx));
-
-                        dist_sums[index] += dist;
-                        col_sum += dist;
-                    }
-
-                    col_sums(tx + block_radius, index) = col_sum;
-                }
-
-                up_col_sums(j, index) = col_sums(block_window - 1, index);
-            }
-        }
-
-        __device__ __forceinline__ void shiftRight_FirstRow(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
-        {
-            for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
-            {
-                int y = index / search_window;
-                int x = index - y * search_window;
-
-                int ay = i;
-                int ax = j + block_radius;
-
-                int by = i + y - search_radius;
-                int bx = j + x - search_radius + block_radius;
-
-                int col_sum = 0;
-
-                for (int ty = -block_radius; ty <= block_radius; ++ty)
-                    col_sum += calcDist(I0(ay + ty, ax), I1(by + ty, bx));
-
-                dist_sums[index] += col_sum - col_sums(first, index);
-
-                col_sums(first, index) = col_sum;
-                up_col_sums(j, index) = col_sum;
-            }
-        }
-
-        __device__ __forceinline__ void shiftRight_UpSums(int i, int j, int first, int* dist_sums, PtrStepi& col_sums, PtrStepi& up_col_sums) const
-        {
-            int ay = i;
-            int ax = j + block_radius;
-
-            T a_up   = I0(ay - block_radius - 1, ax);
-            T a_down = I0(ay + block_radius, ax);
-
-            for(int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
-            {
-                int y = index / search_window;
-                int x = index - y * search_window;
-
-                int by = i + y - search_radius;
-                int bx = j + x - search_radius + block_radius;
-
-                T b_up   = I1(by - block_radius - 1, bx);
-                T b_down = I1(by + block_radius, bx);
-
-                int col_sum = up_col_sums(j, index) + calcDist(a_down, b_down) - calcDist(a_up, b_up);
-
-                dist_sums[index] += col_sum  - col_sums(first, index);
-                col_sums(first, index) = col_sum;
-                up_col_sums(j, index) = col_sum;
-            }
-        }
-
-        __device__ __forceinline__ void convolve_window(int i, int j, const int* dist_sums, float& velx, float& vely) const
-        {
-            int bestDist = numeric_limits<int>::max();
-            int bestInd = -1;
-
-            for (int index = threadIdx.x; index < search_window * search_window; index += STRIDE)
-            {
-                int curDist = dist_sums[index];
-                if (curDist < bestDist)
-                {
-                    bestDist = curDist;
-                    bestInd = index;
-                }
-            }
-
-            __shared__ int cta_dist_buffer[CTA_SIZE];
-            __shared__ int cta_ind_buffer[CTA_SIZE];
-
-            reduceKeyVal<CTA_SIZE>(cta_dist_buffer, bestDist, cta_ind_buffer, bestInd, threadIdx.x, less<int>());
-
-            if (threadIdx.x == 0)
-            {
-                int y = bestInd / search_window;
-                int x = bestInd - y * search_window;
-
-                velx = x - search_radius;
-                vely = y - search_radius;
-            }
-        }
-
-        __device__ __forceinline__ void operator()(PtrStepf velx, PtrStepf vely) const
-        {
-            int tbx = blockIdx.x * TILE_COLS;
-            int tby = blockIdx.y * TILE_ROWS;
-
-            int tex = ::min(tbx + TILE_COLS, I0.cols);
-            int tey = ::min(tby + TILE_ROWS, I0.rows);
-
-            PtrStepi col_sums;
-            col_sums.data = buffer.ptr(I0.cols + blockIdx.x * block_window) + blockIdx.y * search_window * search_window;
-            col_sums.step = buffer.step;
-
-            PtrStepi up_col_sums;
-            up_col_sums.data = buffer.data + blockIdx.y * search_window * search_window;
-            up_col_sums.step = buffer.step;
-
-            extern __shared__ int dist_sums[]; //search_window * search_window
-
-            int first = 0;
-
-            for (int i = tby; i < tey; ++i)
-            {
-                for (int j = tbx; j < tex; ++j)
-                {
-                    __syncthreads();
-
-                    if (j == tbx)
-                    {
-                        initSums_BruteForce(i, j, dist_sums, col_sums, up_col_sums);
-                        first = 0;
-                    }
-                    else
-                    {
-                        if (i == tby)
-                          shiftRight_FirstRow(i, j, first, dist_sums, col_sums, up_col_sums);
-                        else
-                          shiftRight_UpSums(i, j, first, dist_sums, col_sums, up_col_sums);
-
-                        first = (first + 1) % block_window;
-                    }
-
-                    __syncthreads();
-
-                    convolve_window(i, j, dist_sums, velx(i, j), vely(i, j));
-                }
-            }
-        }
-
-    };
-
-    template<typename T> __global__ void optflowbm_fast_kernel(const FastOptFlowBM<T> fbm, PtrStepf velx, PtrStepf vely)
-    {
-        fbm(velx, vely);
-    }
-
-    void get_buffer_size(int src_cols, int src_rows, int search_window, int block_window, int& buffer_cols, int& buffer_rows)
-    {
-        dim3 grid(divUp(src_cols, TILE_COLS), divUp(src_rows, TILE_ROWS));
-
-        buffer_cols = search_window * search_window * grid.y;
-        buffer_rows = src_cols + block_window * grid.x;
-    }
-
-    template <typename T>
-    void calc(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream)
-    {
-        FastOptFlowBM<T> fbm(search_window, block_window, I0, I1, buffer);
-
-        dim3 block(CTA_SIZE, 1);
-        dim3 grid(divUp(I0.cols, TILE_COLS), divUp(I0.rows, TILE_ROWS));
-
-        size_t smem = search_window * search_window * sizeof(int);
-
-        optflowbm_fast_kernel<<<grid, block, smem, stream>>>(fbm, velx, vely);
-        cudaSafeCall ( cudaGetLastError () );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    template void calc<uchar>(PtrStepSzb I0, PtrStepSzb I1, PtrStepSzf velx, PtrStepSzf vely, PtrStepi buffer, int search_window, int block_window, cudaStream_t stream);
-}
-
-#endif // !defined CUDA_DISABLER
--- a/modules/gpu/src/cuda/optical_flow.cu
+++ b/modules/gpu/src/cuda/optical_flow.cu
@@ -1,220 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace optical_flow
-    {
-        #define NEEDLE_MAP_SCALE 16
-        #define NUM_VERTS_PER_ARROW 6
-
-        __global__ void NeedleMapAverageKernel(const PtrStepSzf u, const PtrStepf v, PtrStepf u_avg, PtrStepf v_avg)
-        {
-            __shared__ float smem[2 * NEEDLE_MAP_SCALE];
-
-            volatile float* u_col_sum = smem;
-            volatile float* v_col_sum = u_col_sum + NEEDLE_MAP_SCALE;
-
-            const int x = blockIdx.x * NEEDLE_MAP_SCALE + threadIdx.x;
-            const int y = blockIdx.y * NEEDLE_MAP_SCALE;
-
-            u_col_sum[threadIdx.x] = 0;
-            v_col_sum[threadIdx.x] = 0;
-
-            #pragma unroll
-            for(int i = 0; i < NEEDLE_MAP_SCALE; ++i)
-            {
-                u_col_sum[threadIdx.x] += u(::min(y + i, u.rows - 1), x);
-                v_col_sum[threadIdx.x] += v(::min(y + i, u.rows - 1), x);
-            }
-
-            if (threadIdx.x < 8)
-            {
-                // now add the column sums
-                const uint X = threadIdx.x;
-
-                if (X | 0xfe == 0xfe)  // bit 0 is 0
-                {
-                    u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 1];
-                    v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 1];
-                }
-
-                if (X | 0xfe == 0xfc) // bits 0 & 1 == 0
-                {
-                    u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 2];
-                    v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 2];
-                }
-
-                if (X | 0xf8 == 0xf8)
-                {
-                    u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 4];
-                    v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 4];
-                }
-
-                if (X == 0)
-                {
-                    u_col_sum[threadIdx.x] += u_col_sum[threadIdx.x + 8];
-                    v_col_sum[threadIdx.x] += v_col_sum[threadIdx.x + 8];
-                }
-            }
-
-            if (threadIdx.x == 0)
-            {
-                const float coeff = 1.0f / (NEEDLE_MAP_SCALE * NEEDLE_MAP_SCALE);
-
-                u_col_sum[0] *= coeff;
-                v_col_sum[0] *= coeff;
-
-                u_avg(blockIdx.y, blockIdx.x) = u_col_sum[0];
-                v_avg(blockIdx.y, blockIdx.x) = v_col_sum[0];
-            }
-        }
-
-        void NeedleMapAverage_gpu(PtrStepSzf u, PtrStepSzf v, PtrStepSzf u_avg, PtrStepSzf v_avg)
-        {
-            const dim3 block(NEEDLE_MAP_SCALE);
-            const dim3 grid(u_avg.cols, u_avg.rows);
-
-            NeedleMapAverageKernel<<<grid, block>>>(u, v, u_avg, v_avg);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        __global__ void NeedleMapVertexKernel(const PtrStepSzf u_avg, const PtrStepf v_avg, float* vertex_data, float* color_data, float max_flow, float xscale, float yscale)
-        {
-            // test - just draw a triangle at each pixel
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            const float arrow_x = x * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
-            const float arrow_y = y * NEEDLE_MAP_SCALE + NEEDLE_MAP_SCALE / 2.0f;
-
-            float3 v[NUM_VERTS_PER_ARROW];
-
-            if (x < u_avg.cols && y < u_avg.rows)
-            {
-                const float u_avg_val = u_avg(y, x);
-                const float v_avg_val = v_avg(y, x);
-
-                const float theta = ::atan2f(v_avg_val, u_avg_val);// + CV_PI;
-
-                float r = ::sqrtf(v_avg_val * v_avg_val + u_avg_val * u_avg_val);
-                r = fmin(14.0f * (r / max_flow), 14.0f);
-
-                v[0].z = 1.0f;
-                v[1].z = 0.7f;
-                v[2].z = 0.7f;
-                v[3].z = 0.7f;
-                v[4].z = 0.7f;
-                v[5].z = 1.0f;
-
-                v[0].x = arrow_x;
-                v[0].y = arrow_y;
-                v[5].x = arrow_x;
-                v[5].y = arrow_y;
-
-                v[2].x = arrow_x + r * ::cosf(theta);
-                v[2].y = arrow_y + r * ::sinf(theta);
-                v[3].x = v[2].x;
-                v[3].y = v[2].y;
-
-                r = ::fmin(r, 2.5f);
-
-                v[1].x = arrow_x + r * ::cosf(theta - CV_PI_F / 2.0f);
-                v[1].y = arrow_y + r * ::sinf(theta - CV_PI_F / 2.0f);
-
-                v[4].x = arrow_x + r * ::cosf(theta + CV_PI_F / 2.0f);
-                v[4].y = arrow_y + r * ::sinf(theta + CV_PI_F / 2.0f);
-
-                int indx = (y * u_avg.cols + x) * NUM_VERTS_PER_ARROW * 3;
-
-                color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
-                vertex_data[indx++] = v[0].x * xscale;
-                vertex_data[indx++] = v[0].y * yscale;
-                vertex_data[indx++] = v[0].z;
-
-                color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
-                vertex_data[indx++] = v[1].x * xscale;
-                vertex_data[indx++] = v[1].y * yscale;
-                vertex_data[indx++] = v[1].z;
-
-                color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
-                vertex_data[indx++] = v[2].x * xscale;
-                vertex_data[indx++] = v[2].y * yscale;
-                vertex_data[indx++] = v[2].z;
-
-                color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
-                vertex_data[indx++] = v[3].x * xscale;
-                vertex_data[indx++] = v[3].y * yscale;
-                vertex_data[indx++] = v[3].z;
-
-                color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
-                vertex_data[indx++] = v[4].x * xscale;
-                vertex_data[indx++] = v[4].y * yscale;
-                vertex_data[indx++] = v[4].z;
-
-                color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
-                vertex_data[indx++] = v[5].x * xscale;
-                vertex_data[indx++] = v[5].y * yscale;
-                vertex_data[indx++] = v[5].z;
-            }
-        }
-
-        void CreateOpticalFlowNeedleMap_gpu(PtrStepSzf u_avg, PtrStepSzf v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale)
-        {
-            const dim3 block(16);
-            const dim3 grid(divUp(u_avg.cols, block.x), divUp(u_avg.rows, block.y));
-
-            NeedleMapVertexKernel<<<grid, block>>>(u_avg, v_avg, vertex_buffer, color_data, max_flow, xscale, yscale);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    }
-}}}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/optical_flow_farneback.cu
+++ b/modules/gpu/src/cuda/optical_flow_farneback.cu
@@ -1,647 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-
-#define tx threadIdx.x
-#define ty threadIdx.y
-#define bx blockIdx.x
-#define by blockIdx.y
-#define bdx blockDim.x
-#define bdy blockDim.y
-
-#define BORDER_SIZE 5
-#define MAX_KSIZE_HALF 100
-
-namespace cv { namespace gpu { namespace cudev { namespace optflow_farneback
-{
-    __constant__ float c_g[8];
-    __constant__ float c_xg[8];
-    __constant__ float c_xxg[8];
-    __constant__ float c_ig11, c_ig03, c_ig33, c_ig55;
-
-
-    template <int polyN>
-    __global__ void polynomialExpansion(
-            const int height, const int width, const PtrStepf src, PtrStepf dst)
-    {
-        const int y = by * bdy + ty;
-        const int x = bx * (bdx - 2*polyN) + tx - polyN;
-
-        if (y < height)
-        {
-            extern __shared__ float smem[];
-            volatile float *row = smem + tx;
-            int xWarped = ::min(::max(x, 0), width - 1);
-
-            row[0] = src(y, xWarped) * c_g[0];
-            row[bdx] = 0.f;
-            row[2*bdx] = 0.f;
-
-            for (int k = 1; k <= polyN; ++k)
-            {
-                float t0 = src(::max(y - k, 0), xWarped);
-                float t1 = src(::min(y + k, height - 1), xWarped);
-
-                row[0] += c_g[k] * (t0 + t1);
-                row[bdx] += c_xg[k] * (t1 - t0);
-                row[2*bdx] += c_xxg[k] * (t0 + t1);
-            }
-
-            __syncthreads();
-
-            if (tx >= polyN && tx + polyN < bdx && x < width)
-            {
-                float b1 = c_g[0] * row[0];
-                float b3 = c_g[0] * row[bdx];
-                float b5 = c_g[0] * row[2*bdx];
-                float b2 = 0, b4 = 0, b6 = 0;
-
-                for (int k = 1; k <= polyN; ++k)
-                {
-                    b1 += (row[k] + row[-k]) * c_g[k];
-                    b4 += (row[k] + row[-k]) * c_xxg[k];
-                    b2 += (row[k] - row[-k]) * c_xg[k];
-                    b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];
-                    b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];
-                    b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];
-                }
-
-                dst(y, xWarped) = b3*c_ig11;
-                dst(height + y, xWarped) = b2*c_ig11;
-                dst(2*height + y, xWarped) = b1*c_ig03 + b5*c_ig33;
-                dst(3*height + y, xWarped) = b1*c_ig03 + b4*c_ig33;
-                dst(4*height + y, xWarped) = b6*c_ig55;
-            }
-        }
-    }
-
-
-    void setPolynomialExpansionConsts(
-            int polyN, const float *g, const float *xg, const float *xxg,
-            float ig11, float ig03, float ig33, float ig55)
-    {
-        cudaSafeCall(cudaMemcpyToSymbol(c_g, g, (polyN + 1) * sizeof(*g)));
-        cudaSafeCall(cudaMemcpyToSymbol(c_xg, xg, (polyN + 1) * sizeof(*xg)));
-        cudaSafeCall(cudaMemcpyToSymbol(c_xxg, xxg, (polyN + 1) * sizeof(*xxg)));
-        cudaSafeCall(cudaMemcpyToSymbol(c_ig11, &ig11, sizeof(ig11)));
-        cudaSafeCall(cudaMemcpyToSymbol(c_ig03, &ig03, sizeof(ig03)));
-        cudaSafeCall(cudaMemcpyToSymbol(c_ig33, &ig33, sizeof(ig33)));
-        cudaSafeCall(cudaMemcpyToSymbol(c_ig55, &ig55, sizeof(ig55)));
-    }
-
-
-    void polynomialExpansionGpu(const PtrStepSzf &src, int polyN, PtrStepSzf dst, cudaStream_t stream)
-    {
-        dim3 block(256);
-        dim3 grid(divUp(src.cols, block.x - 2*polyN), src.rows);
-        int smem = 3 * block.x * sizeof(float);
-
-        if (polyN == 5)
-            polynomialExpansion<5><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
-        else if (polyN == 7)
-            polynomialExpansion<7><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
-
-        cudaSafeCall(cudaGetLastError());
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }
-
-
-    __constant__ float c_border[BORDER_SIZE + 1];
-
-    __global__ void updateMatrices(
-            const int height, const int width, const PtrStepf flowx, const PtrStepf flowy,
-            const PtrStepf R0, const PtrStepf R1, PtrStepf M)
-    {
-        const int y = by * bdy + ty;
-        const int x = bx * bdx + tx;
-
-        if (y < height && x < width)
-        {
-            float dx = flowx(y, x);
-            float dy = flowy(y, x);
-            float fx = x + dx;
-            float fy = y + dy;
-
-            int x1 = floorf(fx);
-            int y1 = floorf(fy);
-            fx -= x1; fy -= y1;
-
-            float r2, r3, r4, r5, r6;
-
-            if (x1 >= 0 && y1 >= 0 && x1 < width - 1 && y1 < height - 1)
-            {
-                float a00 = (1.f - fx) * (1.f - fy);
-                float a01 = fx * (1.f - fy);
-                float a10 = (1.f - fx) * fy;
-                float a11 = fx * fy;
-
-                r2 = a00 * R1(y1, x1) +
-                     a01 * R1(y1, x1 + 1) +
-                     a10 * R1(y1 + 1, x1) +
-                     a11 * R1(y1 + 1, x1 + 1);
-
-                r3 = a00 * R1(height + y1, x1) +
-                     a01 * R1(height + y1, x1 + 1) +
-                     a10 * R1(height + y1 + 1, x1) +
-                     a11 * R1(height + y1 + 1, x1 + 1);
-
-                r4 = a00 * R1(2*height + y1, x1) +
-                     a01 * R1(2*height + y1, x1 + 1) +
-                     a10 * R1(2*height + y1 + 1, x1) +
-                     a11 * R1(2*height + y1 + 1, x1 + 1);
-
-                r5 = a00 * R1(3*height + y1, x1) +
-                     a01 * R1(3*height + y1, x1 + 1) +
-                     a10 * R1(3*height + y1 + 1, x1) +
-                     a11 * R1(3*height + y1 + 1, x1 + 1);
-
-                r6 = a00 * R1(4*height + y1, x1) +
-                     a01 * R1(4*height + y1, x1 + 1) +
-                     a10 * R1(4*height + y1 + 1, x1) +
-                     a11 * R1(4*height + y1 + 1, x1 + 1);
-
-                r4 = (R0(2*height + y, x) + r4) * 0.5f;
-                r5 = (R0(3*height + y, x) + r5) * 0.5f;
-                r6 = (R0(4*height + y, x) + r6) * 0.25f;
-            }
-            else
-            {
-                r2 = r3 = 0.f;
-                r4 = R0(2*height + y, x);
-                r5 = R0(3*height + y, x);
-                r6 = R0(4*height + y, x) * 0.5f;
-            }
-
-            r2 = (R0(y, x) - r2) * 0.5f;
-            r3 = (R0(height + y, x) - r3) * 0.5f;
-
-            r2 += r4*dy + r6*dx;
-            r3 += r6*dy + r5*dx;
-
-            float scale =
-                    c_border[::min(x, BORDER_SIZE)] *
-                    c_border[::min(y, BORDER_SIZE)] *
-                    c_border[::min(width - x - 1, BORDER_SIZE)] *
-                    c_border[::min(height - y - 1, BORDER_SIZE)];
-
-            r2 *= scale; r3 *= scale; r4 *= scale;
-            r5 *= scale; r6 *= scale;
-
-            M(y, x) = r4*r4 + r6*r6;
-            M(height + y, x) = (r4 + r5)*r6;
-            M(2*height + y, x) = r5*r5 + r6*r6;
-            M(3*height + y, x) = r4*r2 + r6*r3;
-            M(4*height + y, x) = r6*r2 + r5*r3;
-        }
-    }
-
-
-    void setUpdateMatricesConsts()
-    {
-        static const float border[BORDER_SIZE + 1] = {0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f};
-        cudaSafeCall(cudaMemcpyToSymbol(c_border, border, (BORDER_SIZE + 1) * sizeof(*border)));
-    }
-
-
-    void updateMatricesGpu(
-            const PtrStepSzf flowx, const PtrStepSzf flowy, const PtrStepSzf R0, const PtrStepSzf R1,
-            PtrStepSzf M, cudaStream_t stream)
-    {
-        dim3 block(32, 8);
-        dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
-
-        updateMatrices<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, flowx, flowy, R0, R1, M);
-
-        cudaSafeCall(cudaGetLastError());
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }
-
-
-    __global__ void updateFlow(
-            const int height, const int width, const PtrStepf M, PtrStepf flowx, PtrStepf flowy)
-    {
-        const int y = by * bdy + ty;
-        const int x = bx * bdx + tx;
-
-        if (y < height && x < width)
-        {
-            float g11 = M(y, x);
-            float g12 = M(height + y, x);
-            float g22 = M(2*height + y, x);
-            float h1 = M(3*height + y, x);
-            float h2 = M(4*height + y, x);
-
-            float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);
-
-            flowx(y, x) = (g11*h2 - g12*h1) * detInv;
-            flowy(y, x) = (g22*h1 - g12*h2) * detInv;
-        }
-    }
-
-
-    void updateFlowGpu(const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream)
-    {
-        dim3 block(32, 8);
-        dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
-
-        updateFlow<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, M, flowx, flowy);
-
-        cudaSafeCall(cudaGetLastError());
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }
-
-
-    /*__global__ void boxFilter(
-            const int height, const int width, const PtrStepf src,
-            const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
-    {
-        const int y = by * bdy + ty;
-        const int x = bx * bdx + tx;
-
-        extern __shared__ float smem[];
-        volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
-
-        if (y < height)
-        {
-            // Vertical pass
-            for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
-            {
-                int xExt = int(bx * bdx) + i - ksizeHalf;
-                xExt = ::min(::max(xExt, 0), width - 1);
-
-                row[i] = src(y, xExt);
-                for (int j = 1; j <= ksizeHalf; ++j)
-                    row[i] += src(::max(y - j, 0), xExt) + src(::min(y + j, height - 1), xExt);
-            }
-
-            if (x < width)
-            {
-                __syncthreads();
-
-                // Horizontal passs
-                row += tx + ksizeHalf;
-                float res = row[0];
-                for (int i = 1; i <= ksizeHalf; ++i)
-                    res += row[-i] + row[i];
-                dst(y, x) = res * boxAreaInv;
-            }
-        }
-    }
-
-
-    void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
-    {
-        dim3 block(256);
-        dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-        int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
-
-        float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
-        boxFilter<<<grid, block, smem, stream>>>(src.rows, src.cols, src, ksizeHalf, boxAreaInv, dst);
-
-        cudaSafeCall(cudaGetLastError());
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }*/
-
-
-    __global__ void boxFilter5(
-            const int height, const int width, const PtrStepf src,
-            const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
-    {
-        const int y = by * bdy + ty;
-        const int x = bx * bdx + tx;
-
-        extern __shared__ float smem[];
-
-        const int smw = bdx + 2*ksizeHalf; // shared memory "width"
-        volatile float *row = smem + 5 * ty * smw;
-
-        if (y < height)
-        {
-            // Vertical pass
-            for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
-            {
-                int xExt = int(bx * bdx) + i - ksizeHalf;
-                xExt = ::min(::max(xExt, 0), width - 1);
-
-                #pragma unroll
-                for (int k = 0; k < 5; ++k)
-                    row[k*smw + i] = src(k*height + y, xExt);
-
-                for (int j = 1; j <= ksizeHalf; ++j)
-                    #pragma unroll
-                    for (int k = 0; k < 5; ++k)
-                        row[k*smw + i] +=
-                                src(k*height + ::max(y - j, 0), xExt) +
-                                src(k*height + ::min(y + j, height - 1), xExt);
-            }
-
-            if (x < width)
-            {
-                __syncthreads();
-
-                // Horizontal passs
-
-                row += tx + ksizeHalf;
-                float res[5];
-
-                #pragma unroll
-                for (int k = 0; k < 5; ++k)
-                    res[k] = row[k*smw];
-
-                for (int i = 1; i <= ksizeHalf; ++i)
-                    #pragma unroll
-                    for (int k = 0; k < 5; ++k)
-                        res[k] += row[k*smw - i] + row[k*smw + i];
-
-                #pragma unroll
-                for (int k = 0; k < 5; ++k)
-                    dst(k*height + y, x) = res[k] * boxAreaInv;
-            }
-        }
-    }
-
-
-    void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
-    {
-        int height = src.rows / 5;
-        int width = src.cols;
-
-        dim3 block(256);
-        dim3 grid(divUp(width, block.x), divUp(height, block.y));
-        int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
-
-        float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
-        boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
-
-        cudaSafeCall(cudaGetLastError());
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }
-
-
-    void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
-    {
-        int height = src.rows / 5;
-        int width = src.cols;
-
-        dim3 block(128);
-        dim3 grid(divUp(width, block.x), divUp(height, block.y));
-        int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
-
-        float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
-        boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
-
-        cudaSafeCall(cudaGetLastError());
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }
-
-
-    __constant__ float c_gKer[MAX_KSIZE_HALF + 1];
-
-    template <typename Border>
-    __global__ void gaussianBlur(
-            const int height, const int width, const PtrStepf src, const int ksizeHalf,
-            const Border b, PtrStepf dst)
-    {
-        const int y = by * bdy + ty;
-        const int x = bx * bdx + tx;
-
-        extern __shared__ float smem[];
-        volatile float *row = smem + ty * (bdx + 2*ksizeHalf);
-
-        if (y < height)
-        {
-            // Vertical pass
-            for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
-            {
-                int xExt = int(bx * bdx) + i - ksizeHalf;
-                xExt = b.idx_col(xExt);
-                row[i] = src(y, xExt) * c_gKer[0];
-                for (int j = 1; j <= ksizeHalf; ++j)
-                    row[i] +=
-                            (src(b.idx_row_low(y - j), xExt) +
-                             src(b.idx_row_high(y + j), xExt)) * c_gKer[j];
-            }
-
-            if (x < width)
-            {
-                __syncthreads();
-
-                // Horizontal pass
-                row += tx + ksizeHalf;
-                float res = row[0] * c_gKer[0];
-                for (int i = 1; i <= ksizeHalf; ++i)
-                    res += (row[-i] + row[i]) * c_gKer[i];
-                dst(y, x) = res;
-            }
-        }
-    }
-
-
-    void setGaussianBlurKernel(const float *gKer, int ksizeHalf)
-    {
-        cudaSafeCall(cudaMemcpyToSymbol(c_gKer, gKer, (ksizeHalf + 1) * sizeof(*gKer)));
-    }
-
-
-    template <typename Border>
-    void gaussianBlurCaller(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
-    {
-        int height = src.rows;
-        int width = src.cols;
-
-        dim3 block(256);
-        dim3 grid(divUp(width, block.x), divUp(height, block.y));
-        int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
-        Border b(height, width);
-
-        gaussianBlur<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
-
-        cudaSafeCall(cudaGetLastError());
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }
-
-
-    void gaussianBlurGpu(
-            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
-    {
-        typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
-
-        static const caller_t callers[] =
-        {
-            gaussianBlurCaller<BrdReflect101<float> >,
-            gaussianBlurCaller<BrdReplicate<float> >,
-        };
-
-        callers[borderMode](src, ksizeHalf, dst, stream);
-    }
-
-
-    template <typename Border>
-    __global__ void gaussianBlur5(
-            const int height, const int width, const PtrStepf src, const int ksizeHalf,
-            const Border b, PtrStepf dst)
-    {
-        const int y = by * bdy + ty;
-        const int x = bx * bdx + tx;
-
-        extern __shared__ float smem[];
-
-        const int smw = bdx + 2*ksizeHalf; // shared memory "width"
-        volatile float *row = smem + 5 * ty * smw;
-
-        if (y < height)
-        {
-            // Vertical pass
-            for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
-            {
-                int xExt = int(bx * bdx) + i - ksizeHalf;
-                xExt = b.idx_col(xExt);
-
-                #pragma unroll
-                for (int k = 0; k < 5; ++k)
-                    row[k*smw + i] = src(k*height + y, xExt) * c_gKer[0];
-
-                for (int j = 1; j <= ksizeHalf; ++j)
-                    #pragma unroll
-                    for (int k = 0; k < 5; ++k)
-                        row[k*smw + i] +=
-                                (src(k*height + b.idx_row_low(y - j), xExt) +
-                                 src(k*height + b.idx_row_high(y + j), xExt)) * c_gKer[j];
-            }
-
-            if (x < width)
-            {
-                __syncthreads();
-
-                // Horizontal pass
-
-                row += tx + ksizeHalf;
-                float res[5];
-
-                #pragma unroll
-                for (int k = 0; k < 5; ++k)
-                    res[k] = row[k*smw] * c_gKer[0];
-
-                for (int i = 1; i <= ksizeHalf; ++i)
-                    #pragma unroll
-                    for (int k = 0; k < 5; ++k)
-                        res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];
-
-                #pragma unroll
-                for (int k = 0; k < 5; ++k)
-                    dst(k*height + y, x) = res[k];
-            }
-        }
-    }
-
-
-    template <typename Border, int blockDimX>
-    void gaussianBlur5Caller(
-            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
-    {
-        int height = src.rows / 5;
-        int width = src.cols;
-
-        dim3 block(blockDimX);
-        dim3 grid(divUp(width, block.x), divUp(height, block.y));
-        int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
-        Border b(height, width);
-
-        gaussianBlur5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);
-
-        cudaSafeCall(cudaGetLastError());
-
-        if (stream == 0)
-            cudaSafeCall(cudaDeviceSynchronize());
-    }
-
-
-    void gaussianBlur5Gpu(
-            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
-    {
-        typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
-
-        static const caller_t callers[] =
-        {
-            gaussianBlur5Caller<BrdReflect101<float>,256>,
-            gaussianBlur5Caller<BrdReplicate<float>,256>,
-        };
-
-        callers[borderMode](src, ksizeHalf, dst, stream);
-    }
-
-    void gaussianBlur5Gpu_CC11(
-            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
-    {
-        typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
-
-        static const caller_t callers[] =
-        {
-            gaussianBlur5Caller<BrdReflect101<float>,128>,
-            gaussianBlur5Caller<BrdReplicate<float>,128>,
-        };
-
-        callers[borderMode](src, ksizeHalf, dst, stream);
-    }
-
-}}}} // namespace cv { namespace gpu { namespace cudev { namespace optflow_farneback
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/orb.cu
+++ b/modules/gpu/src/cuda/orb.cu
@@ -1,424 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include <thrust/device_ptr.h>
-#include <thrust/sort.h>
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace orb
-    {
-        ////////////////////////////////////////////////////////////////////////////////////////////////////////
-        // cull
-
-        int cull_gpu(int* loc, float* response, int size, int n_points)
-        {
-            thrust::device_ptr<int> loc_ptr(loc);
-            thrust::device_ptr<float> response_ptr(response);
-
-            thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
-
-            return n_points;
-        }
-
-        ////////////////////////////////////////////////////////////////////////////////////////////////////////
-        // HarrisResponses
-
-        __global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
-        {
-            __shared__ int smem0[8 * 32];
-            __shared__ int smem1[8 * 32];
-            __shared__ int smem2[8 * 32];
-
-            const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
-
-            if (ptidx < npoints)
-            {
-                const short2 loc = loc_[ptidx];
-
-                const int r = blockSize / 2;
-                const int x0 = loc.x - r;
-                const int y0 = loc.y - r;
-
-                int a = 0, b = 0, c = 0;
-
-                for (int ind = threadIdx.x; ind < blockSize * blockSize; ind += blockDim.x)
-                {
-                    const int i = ind / blockSize;
-                    const int j = ind % blockSize;
-
-                    int Ix = (img(y0 + i, x0 + j + 1) - img(y0 + i, x0 + j - 1)) * 2 +
-                        (img(y0 + i - 1, x0 + j + 1) - img(y0 + i - 1, x0 + j - 1)) +
-                        (img(y0 + i + 1, x0 + j + 1) - img(y0 + i + 1, x0 + j - 1));
-
-                    int Iy = (img(y0 + i + 1, x0 + j) - img(y0 + i - 1, x0 + j)) * 2 +
-                        (img(y0 + i + 1, x0 + j - 1) - img(y0 + i - 1, x0 + j - 1)) +
-                        (img(y0 + i + 1, x0 + j + 1) - img(y0 + i - 1, x0 + j + 1));
-
-                    a += Ix * Ix;
-                    b += Iy * Iy;
-                    c += Ix * Iy;
-                }
-
-                int* srow0 = smem0 + threadIdx.y * blockDim.x;
-                int* srow1 = smem1 + threadIdx.y * blockDim.x;
-                int* srow2 = smem2 + threadIdx.y * blockDim.x;
-
-                plus<int> op;
-                reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op));
-
-                if (threadIdx.x == 0)
-                {
-                    float scale = (1 << 2) * blockSize * 255.0f;
-                    scale = 1.0f / scale;
-                    const float scale_sq_sq = scale * scale * scale * scale;
-
-                    response[ptidx] = ((float)a * b - (float)c * c - harris_k * ((float)a + b) * ((float)a + b)) * scale_sq_sq;
-                }
-            }
-        }
-
-        void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream)
-        {
-            dim3 block(32, 8);
-
-            dim3 grid;
-            grid.x = divUp(npoints, block.y);
-
-            HarrisResponses<<<grid, block, 0, stream>>>(img, loc, response, npoints, blockSize, harris_k);
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        ////////////////////////////////////////////////////////////////////////////////////////////////////////
-        // IC_Angle
-
-        __constant__ int c_u_max[32];
-
-        void loadUMax(const int* u_max, int count)
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(c_u_max, u_max, count * sizeof(int)) );
-        }
-
-        __global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
-        {
-            __shared__ int smem0[8 * 32];
-            __shared__ int smem1[8 * 32];
-
-            int* srow0 = smem0 + threadIdx.y * blockDim.x;
-            int* srow1 = smem1 + threadIdx.y * blockDim.x;
-
-            plus<int> op;
-
-            const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
-
-            if (ptidx < npoints)
-            {
-                int m_01 = 0, m_10 = 0;
-
-                const short2 loc = loc_[ptidx];
-
-                // Treat the center line differently, v=0
-                for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
-                    m_10 += u * image(loc.y, loc.x + u);
-
-                reduce<32>(srow0, m_10, threadIdx.x, op);
-
-                for (int v = 1; v <= half_k; ++v)
-                {
-                    // Proceed over the two lines
-                    int v_sum = 0;
-                    int m_sum = 0;
-                    const int d = c_u_max[v];
-
-                    for (int u = threadIdx.x - d; u <= d; u += blockDim.x)
-                    {
-                        int val_plus = image(loc.y + v, loc.x + u);
-                        int val_minus = image(loc.y - v, loc.x + u);
-
-                        v_sum += (val_plus - val_minus);
-                        m_sum += u * (val_plus + val_minus);
-                    }
-
-                    reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op));
-
-                    m_10 += m_sum;
-                    m_01 += v * v_sum;
-                }
-
-                if (threadIdx.x == 0)
-                {
-                    float kp_dir = ::atan2f((float)m_01, (float)m_10);
-                    kp_dir += (kp_dir < 0) * (2.0f * CV_PI);
-                    kp_dir *= 180.0f / CV_PI;
-
-                    angle[ptidx] = kp_dir;
-                }
-            }
-        }
-
-        void IC_Angle_gpu(PtrStepSzb image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream)
-        {
-            dim3 block(32, 8);
-
-            dim3 grid;
-            grid.x = divUp(npoints, block.y);
-
-            IC_Angle<<<grid, block, 0, stream>>>(image, loc, angle, npoints, half_k);
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        ////////////////////////////////////////////////////////////////////////////////////////////////////////
-        // computeOrbDescriptor
-
-        template <int WTA_K> struct OrbDescriptor;
-
-        #define GET_VALUE(idx) \
-            img(loc.y + __float2int_rn(pattern_x[idx] * sina + pattern_y[idx] * cosa), \
-                loc.x + __float2int_rn(pattern_x[idx] * cosa - pattern_y[idx] * sina))
-
-        template <> struct OrbDescriptor<2>
-        {
-            __device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
-            {
-                pattern_x += 16 * i;
-                pattern_y += 16 * i;
-
-                int t0, t1, val;
-
-                t0 = GET_VALUE(0); t1 = GET_VALUE(1);
-                val = t0 < t1;
-
-                t0 = GET_VALUE(2); t1 = GET_VALUE(3);
-                val |= (t0 < t1) << 1;
-
-                t0 = GET_VALUE(4); t1 = GET_VALUE(5);
-                val |= (t0 < t1) << 2;
-
-                t0 = GET_VALUE(6); t1 = GET_VALUE(7);
-                val |= (t0 < t1) << 3;
-
-                t0 = GET_VALUE(8); t1 = GET_VALUE(9);
-                val |= (t0 < t1) << 4;
-
-                t0 = GET_VALUE(10); t1 = GET_VALUE(11);
-                val |= (t0 < t1) << 5;
-
-                t0 = GET_VALUE(12); t1 = GET_VALUE(13);
-                val |= (t0 < t1) << 6;
-
-                t0 = GET_VALUE(14); t1 = GET_VALUE(15);
-                val |= (t0 < t1) << 7;
-
-                return val;
-            }
-        };
-
-        template <> struct OrbDescriptor<3>
-        {
-            __device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
-            {
-                pattern_x += 12 * i;
-                pattern_y += 12 * i;
-
-                int t0, t1, t2, val;
-
-                t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);
-                val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);
-
-                t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);
-                val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;
-
-                t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);
-                val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;
-
-                t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);
-                val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;
-
-                return val;
-            }
-        };
-
-        template <> struct OrbDescriptor<4>
-        {
-            __device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
-            {
-                pattern_x += 16 * i;
-                pattern_y += 16 * i;
-
-                int t0, t1, t2, t3, k, val;
-                int a, b;
-
-                t0 = GET_VALUE(0); t1 = GET_VALUE(1);
-                t2 = GET_VALUE(2); t3 = GET_VALUE(3);
-                a = 0, b = 2;
-                if( t1 > t0 ) t0 = t1, a = 1;
-                if( t3 > t2 ) t2 = t3, b = 3;
-                k = t0 > t2 ? a : b;
-                val = k;
-
-                t0 = GET_VALUE(4); t1 = GET_VALUE(5);
-                t2 = GET_VALUE(6); t3 = GET_VALUE(7);
-                a = 0, b = 2;
-                if( t1 > t0 ) t0 = t1, a = 1;
-                if( t3 > t2 ) t2 = t3, b = 3;
-                k = t0 > t2 ? a : b;
-                val |= k << 2;
-
-                t0 = GET_VALUE(8); t1 = GET_VALUE(9);
-                t2 = GET_VALUE(10); t3 = GET_VALUE(11);
-                a = 0, b = 2;
-                if( t1 > t0 ) t0 = t1, a = 1;
-                if( t3 > t2 ) t2 = t3, b = 3;
-                k = t0 > t2 ? a : b;
-                val |= k << 4;
-
-                t0 = GET_VALUE(12); t1 = GET_VALUE(13);
-                t2 = GET_VALUE(14); t3 = GET_VALUE(15);
-                a = 0, b = 2;
-                if( t1 > t0 ) t0 = t1, a = 1;
-                if( t3 > t2 ) t2 = t3, b = 3;
-                k = t0 > t2 ? a : b;
-                val |= k << 6;
-
-                return val;
-            }
-        };
-
-        #undef GET_VALUE
-
-        template <int WTA_K>
-        __global__ void computeOrbDescriptor(const PtrStepb img, const short2* loc, const float* angle_, const int npoints,
-            const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize)
-        {
-            const int descidx = blockIdx.x * blockDim.x + threadIdx.x;
-            const int ptidx = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (ptidx < npoints && descidx < dsize)
-            {
-                float angle = angle_[ptidx];
-                angle *= (float)(CV_PI / 180.f);
-
-                float sina, cosa;
-                ::sincosf(angle, &sina, &cosa);
-
-                desc.ptr(ptidx)[descidx] = OrbDescriptor<WTA_K>::calc(img, loc[ptidx], pattern_x, pattern_y, sina, cosa, descidx);
-            }
-        }
-
-        void computeOrbDescriptor_gpu(PtrStepb img, const short2* loc, const float* angle, const int npoints,
-            const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize, int WTA_K, cudaStream_t stream)
-        {
-            dim3 block(32, 8);
-
-            dim3 grid;
-            grid.x = divUp(dsize, block.x);
-            grid.y = divUp(npoints, block.y);
-
-            switch (WTA_K)
-            {
-            case 2:
-                computeOrbDescriptor<2><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
-                break;
-
-            case 3:
-                computeOrbDescriptor<3><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
-                break;
-
-            case 4:
-                computeOrbDescriptor<4><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
-                break;
-            }
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        ////////////////////////////////////////////////////////////////////////////////////////////////////////
-        // mergeLocation
-
-        __global__ void mergeLocation(const short2* loc_, float* x, float* y, const int npoints, float scale)
-        {
-            const int ptidx = blockIdx.x * blockDim.x + threadIdx.x;
-
-            if (ptidx < npoints)
-            {
-                short2 loc = loc_[ptidx];
-
-                x[ptidx] = loc.x * scale;
-                y[ptidx] = loc.y * scale;
-            }
-        }
-
-        void mergeLocation_gpu(const short2* loc, float* x, float* y, int npoints, float scale, cudaStream_t stream)
-        {
-            dim3 block(256);
-
-            dim3 grid;
-            grid.x = divUp(npoints, block.x);
-
-            mergeLocation<<<grid, block, 0, stream>>>(loc, x, y, npoints, scale);
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    }
-}}}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/pyr_down.cu
+++ b/modules/gpu/src/cuda/pyr_down.cu
@@ -1,228 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template <typename T, typename B> __global__ void pyrDown(const PtrStepSz<T> src, PtrStep<T> dst, const B b, int dst_cols)
-        {
-            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_t;
-
-            __shared__ work_t smem[256 + 4];
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y;
-
-            const int src_y = 2 * y;
-
-            if (src_y >= 2 && src_y < src.rows - 2 && x >= 2 && x < src.cols - 2)
-            {
-                {
-                    work_t sum;
-
-                    sum =       0.0625f * src(src_y - 2, x);
-                    sum = sum + 0.25f   * src(src_y - 1, x);
-                    sum = sum + 0.375f  * src(src_y    , x);
-                    sum = sum + 0.25f   * src(src_y + 1, x);
-                    sum = sum + 0.0625f * src(src_y + 2, x);
-
-                    smem[2 + threadIdx.x] = sum;
-                }
-
-                if (threadIdx.x < 2)
-                {
-                    const int left_x = x - 2;
-
-                    work_t sum;
-
-                    sum =       0.0625f * src(src_y - 2, left_x);
-                    sum = sum + 0.25f   * src(src_y - 1, left_x);
-                    sum = sum + 0.375f  * src(src_y    , left_x);
-                    sum = sum + 0.25f   * src(src_y + 1, left_x);
-                    sum = sum + 0.0625f * src(src_y + 2, left_x);
-
-                    smem[threadIdx.x] = sum;
-                }
-
-                if (threadIdx.x > 253)
-                {
-                    const int right_x = x + 2;
-
-                    work_t sum;
-
-                    sum =       0.0625f * src(src_y - 2, right_x);
-                    sum = sum + 0.25f   * src(src_y - 1, right_x);
-                    sum = sum + 0.375f  * src(src_y    , right_x);
-                    sum = sum + 0.25f   * src(src_y + 1, right_x);
-                    sum = sum + 0.0625f * src(src_y + 2, right_x);
-
-                    smem[4 + threadIdx.x] = sum;
-                }
-            }
-            else
-            {
-                {
-                    work_t sum;
-
-                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(x));
-                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col_high(x));
-                    sum = sum + 0.375f  * src(src_y                    , b.idx_col_high(x));
-                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col_high(x));
-                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(x));
-
-                    smem[2 + threadIdx.x] = sum;
-                }
-
-                if (threadIdx.x < 2)
-                {
-                    const int left_x = x - 2;
-
-                    work_t sum;
-
-                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col(left_x));
-                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col(left_x));
-                    sum = sum + 0.375f  * src(src_y                    , b.idx_col(left_x));
-                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col(left_x));
-                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col(left_x));
-
-                    smem[threadIdx.x] = sum;
-                }
-
-                if (threadIdx.x > 253)
-                {
-                    const int right_x = x + 2;
-
-                    work_t sum;
-
-                    sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(right_x));
-                    sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col_high(right_x));
-                    sum = sum + 0.375f  * src(src_y                    , b.idx_col_high(right_x));
-                    sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col_high(right_x));
-                    sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(right_x));
-
-                    smem[4 + threadIdx.x] = sum;
-                }
-            }
-
-            __syncthreads();
-
-            if (threadIdx.x < 128)
-            {
-                const int tid2 = threadIdx.x * 2;
-
-                work_t sum;
-
-                sum =       0.0625f * smem[2 + tid2 - 2];
-                sum = sum + 0.25f   * smem[2 + tid2 - 1];
-                sum = sum + 0.375f  * smem[2 + tid2    ];
-                sum = sum + 0.25f   * smem[2 + tid2 + 1];
-                sum = sum + 0.0625f * smem[2 + tid2 + 2];
-
-                const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
-
-                if (dst_x < dst_cols)
-                    dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
-            }
-        }
-
-        template <typename T, template <typename> class B> void pyrDown_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
-        {
-            const dim3 block(256);
-            const dim3 grid(divUp(src.cols, block.x), dst.rows);
-
-            B<T> b(src.rows, src.cols);
-
-            pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
-        {
-            pyrDown_caller<T, BrdReflect101>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
-        }
-
-        template void pyrDown_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        //template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        template void pyrDown_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrDown_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrDown_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        template void pyrDown_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        //template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrDown_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrDown_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-    } // namespace imgproc
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/pyr_up.cu
+++ b/modules/gpu/src/cuda/pyr_up.cu
@@ -1,196 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template <typename T> __global__ void pyrUp(const PtrStepSz<T> src, PtrStepSz<T> dst)
-        {
-            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            __shared__ sum_t s_srcPatch[10][10];
-            __shared__ sum_t s_dstPatch[20][16];
-
-            if (threadIdx.x < 10 && threadIdx.y < 10)
-            {
-                int srcx = static_cast<int>((blockIdx.x * blockDim.x) / 2 + threadIdx.x) - 1;
-                int srcy = static_cast<int>((blockIdx.y * blockDim.y) / 2 + threadIdx.y) - 1;
-
-                srcx = ::abs(srcx);
-                srcx = ::min(src.cols - 1, srcx);
-
-                srcy = ::abs(srcy);
-                srcy = ::min(src.rows - 1, srcy);
-
-                s_srcPatch[threadIdx.y][threadIdx.x] = saturate_cast<sum_t>(src(srcy, srcx));
-            }
-
-            __syncthreads();
-
-            sum_t sum = VecTraits<sum_t>::all(0);
-
-            const int evenFlag = static_cast<int>((threadIdx.x & 1) == 0);
-            const int oddFlag  = static_cast<int>((threadIdx.x & 1) != 0);
-            const bool eveny = ((threadIdx.y & 1) == 0);
-            const int tidx = threadIdx.x;
-
-            if (eveny)
-            {
-                sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 2) >> 1)];
-                sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx - 1) >> 1)];
-                sum = sum + (evenFlag * 0.375f ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx    ) >> 1)];
-                sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 1) >> 1)];
-                sum = sum + (evenFlag * 0.0625f) * s_srcPatch[1 + (threadIdx.y >> 1)][1 + ((tidx + 2) >> 1)];
-            }
-
-            s_dstPatch[2 + threadIdx.y][threadIdx.x] = sum;
-
-            if (threadIdx.y < 2)
-            {
-                sum = VecTraits<sum_t>::all(0);
-
-                if (eveny)
-                {
-                    sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx - 2) >> 1)];
-                    sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx - 1) >> 1)];
-                    sum = sum + (evenFlag * 0.375f ) * s_srcPatch[0][1 + ((tidx    ) >> 1)];
-                    sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[0][1 + ((tidx + 1) >> 1)];
-                    sum = sum + (evenFlag * 0.0625f) * s_srcPatch[0][1 + ((tidx + 2) >> 1)];
-                }
-
-                s_dstPatch[threadIdx.y][threadIdx.x] = sum;
-            }
-
-            if (threadIdx.y > 13)
-            {
-                sum = VecTraits<sum_t>::all(0);
-
-                if (eveny)
-                {
-                    sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx - 2) >> 1)];
-                    sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx - 1) >> 1)];
-                    sum = sum + (evenFlag * 0.375f ) * s_srcPatch[9][1 + ((tidx    ) >> 1)];
-                    sum = sum + ( oddFlag * 0.25f  ) * s_srcPatch[9][1 + ((tidx + 1) >> 1)];
-                    sum = sum + (evenFlag * 0.0625f) * s_srcPatch[9][1 + ((tidx + 2) >> 1)];
-                }
-
-                s_dstPatch[4 + threadIdx.y][threadIdx.x] = sum;
-            }
-
-            __syncthreads();
-
-            sum = VecTraits<sum_t>::all(0);
-
-            const int tidy = threadIdx.y;
-
-            sum = sum + 0.0625f * s_dstPatch[2 + tidy - 2][threadIdx.x];
-            sum = sum + 0.25f   * s_dstPatch[2 + tidy - 1][threadIdx.x];
-            sum = sum + 0.375f  * s_dstPatch[2 + tidy    ][threadIdx.x];
-            sum = sum + 0.25f   * s_dstPatch[2 + tidy + 1][threadIdx.x];
-            sum = sum + 0.0625f * s_dstPatch[2 + tidy + 2][threadIdx.x];
-
-            if (x < dst.cols && y < dst.rows)
-                dst(y, x) = saturate_cast<T>(4.0f * sum);
-        }
-
-        template <typename T> void pyrUp_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
-        {
-            const dim3 block(16, 16);
-            const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-            pyrUp<<<grid, block, 0, stream>>>(src, dst);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
-        {
-            pyrUp_caller<T>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
-        }
-
-        template void pyrUp_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrUp_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrUp_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        //template void pyrUp_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        template void pyrUp_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrUp_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrUp_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        template void pyrUp_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrUp_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrUp_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        //template void pyrUp_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-        template void pyrUp_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        //template void pyrUp_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrUp_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-        template void pyrUp_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-    } // namespace imgproc
-}}} // namespace cv { namespace gpu { namespace cudev
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/pyrlk.cu
+++ b/modules/gpu/src/cuda/pyrlk.cu
@@ -1,560 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/utility.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::cudev;
-
-namespace pyrlk
-{
-    __constant__ int c_winSize_x;
-    __constant__ int c_winSize_y;
-    __constant__ int c_halfWin_x;
-    __constant__ int c_halfWin_y;
-    __constant__ int c_iters;
-
-    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
-    texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
-    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
-
-    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
-    texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
-
-    template <int cn> struct Tex_I;
-    template <> struct Tex_I<1>
-    {
-        static __device__ __forceinline__ float read(float x, float y)
-        {
-            return tex2D(tex_If, x, y);
-        }
-    };
-    template <> struct Tex_I<4>
-    {
-        static __device__ __forceinline__ float4 read(float x, float y)
-        {
-            return tex2D(tex_If4, x, y);
-        }
-    };
-
-    template <int cn> struct Tex_J;
-    template <> struct Tex_J<1>
-    {
-        static __device__ __forceinline__ float read(float x, float y)
-        {
-            return tex2D(tex_Jf, x, y);
-        }
-    };
-    template <> struct Tex_J<4>
-    {
-        static __device__ __forceinline__ float4 read(float x, float y)
-        {
-            return tex2D(tex_Jf4, x, y);
-        }
-    };
-
-    __device__ __forceinline__ void accum(float& dst, float val)
-    {
-        dst += val;
-    }
-    __device__ __forceinline__ void accum(float& dst, const float4& val)
-    {
-        dst += val.x + val.y + val.z;
-    }
-
-    __device__ __forceinline__ float abs_(float a)
-    {
-        return ::fabsf(a);
-    }
-    __device__ __forceinline__ float4 abs_(const float4& a)
-    {
-        return abs(a);
-    }
-
-    template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
-    __global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
-    {
-    #if __CUDA_ARCH__ <= 110
-        const int BLOCK_SIZE = 128;
-    #else
-        const int BLOCK_SIZE = 256;
-    #endif
-
-        __shared__ float smem1[BLOCK_SIZE];
-        __shared__ float smem2[BLOCK_SIZE];
-        __shared__ float smem3[BLOCK_SIZE];
-
-        const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
-        float2 prevPt = prevPts[blockIdx.x];
-        prevPt.x *= (1.0f / (1 << level));
-        prevPt.y *= (1.0f / (1 << level));
-
-        if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
-        {
-            if (tid == 0 && level == 0)
-                status[blockIdx.x] = 0;
-
-            return;
-        }
-
-        prevPt.x -= c_halfWin_x;
-        prevPt.y -= c_halfWin_y;
-
-        // extract the patch from the first image, compute covariation matrix of derivatives
-
-        float A11 = 0;
-        float A12 = 0;
-        float A22 = 0;
-
-        typedef typename TypeVec<float, cn>::vec_type work_type;
-
-        work_type I_patch   [PATCH_Y][PATCH_X];
-        work_type dIdx_patch[PATCH_Y][PATCH_X];
-        work_type dIdy_patch[PATCH_Y][PATCH_X];
-
-        for (int yBase = threadIdx.y, i = 0; yBase < c_winSize_y; yBase += blockDim.y, ++i)
-        {
-            for (int xBase = threadIdx.x, j = 0; xBase < c_winSize_x; xBase += blockDim.x, ++j)
-            {
-                float x = prevPt.x + xBase + 0.5f;
-                float y = prevPt.y + yBase + 0.5f;
-
-                I_patch[i][j] = Tex_I<cn>::read(x, y);
-
-                // Sharr Deriv
-
-                work_type dIdx = 3.0f * Tex_I<cn>::read(x+1, y-1) + 10.0f * Tex_I<cn>::read(x+1, y) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
-                                 (3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x-1, y) + 3.0f * Tex_I<cn>::read(x-1, y+1));
-
-                work_type dIdy = 3.0f * Tex_I<cn>::read(x-1, y+1) + 10.0f * Tex_I<cn>::read(x, y+1) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
-                                (3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x, y-1) + 3.0f * Tex_I<cn>::read(x+1, y-1));
-
-                dIdx_patch[i][j] = dIdx;
-                dIdy_patch[i][j] = dIdy;
-
-                accum(A11, dIdx * dIdx);
-                accum(A12, dIdx * dIdy);
-                accum(A22, dIdy * dIdy);
-            }
-        }
-
-        reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, thrust::make_tuple(plus<float>(), plus<float>(), plus<float>()));
-
-    #if __CUDA_ARCH__ >= 300
-        if (tid == 0)
-        {
-            smem1[0] = A11;
-            smem2[0] = A12;
-            smem3[0] = A22;
-        }
-    #endif
-
-        __syncthreads();
-
-        A11 = smem1[0];
-        A12 = smem2[0];
-        A22 = smem3[0];
-
-        float D = A11 * A22 - A12 * A12;
-
-        if (D < numeric_limits<float>::epsilon())
-        {
-            if (tid == 0 && level == 0)
-                status[blockIdx.x] = 0;
-
-            return;
-        }
-
-        D = 1.f / D;
-
-        A11 *= D;
-        A12 *= D;
-        A22 *= D;
-
-        float2 nextPt = nextPts[blockIdx.x];
-        nextPt.x *= 2.f;
-        nextPt.y *= 2.f;
-
-        nextPt.x -= c_halfWin_x;
-        nextPt.y -= c_halfWin_y;
-
-        for (int k = 0; k < c_iters; ++k)
-        {
-            if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows)
-            {
-                if (tid == 0 && level == 0)
-                    status[blockIdx.x] = 0;
-
-                return;
-            }
-
-            float b1 = 0;
-            float b2 = 0;
-
-            for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
-            {
-                for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
-                {
-                    work_type I_val = I_patch[i][j];
-                    work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
-
-                    work_type diff = (J_val - I_val) * 32.0f;
-
-                    accum(b1, diff * dIdx_patch[i][j]);
-                    accum(b2, diff * dIdy_patch[i][j]);
-                }
-            }
-
-            reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2), thrust::tie(b1, b2), tid, thrust::make_tuple(plus<float>(), plus<float>()));
-
-        #if __CUDA_ARCH__ >= 300
-            if (tid == 0)
-            {
-                smem1[0] = b1;
-                smem2[0] = b2;
-            }
-        #endif
-
-            __syncthreads();
-
-            b1 = smem1[0];
-            b2 = smem2[0];
-
-            float2 delta;
-            delta.x = A12 * b2 - A22 * b1;
-            delta.y = A12 * b1 - A11 * b2;
-
-            nextPt.x += delta.x;
-            nextPt.y += delta.y;
-
-            if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
-                break;
-        }
-
-        float errval = 0;
-        if (calcErr)
-        {
-            for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
-            {
-                for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
-                {
-                    work_type I_val = I_patch[i][j];
-                    work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
-
-                    work_type diff = J_val - I_val;
-
-                    accum(errval, abs_(diff));
-                }
-            }
-
-            reduce<BLOCK_SIZE>(smem1, errval, tid, plus<float>());
-        }
-
-        if (tid == 0)
-        {
-            nextPt.x += c_halfWin_x;
-            nextPt.y += c_halfWin_y;
-
-            nextPts[blockIdx.x] = nextPt;
-
-            if (calcErr)
-                err[blockIdx.x] = static_cast<float>(errval) / (cn * c_winSize_x * c_winSize_y);
-        }
-    }
-
-    template <int cn, int PATCH_X, int PATCH_Y>
-    void sparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                       int level, dim3 block, cudaStream_t stream)
-    {
-        dim3 grid(ptcount);
-
-        if (level == 0 && err)
-            sparseKernel<cn, PATCH_X, PATCH_Y, true><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
-        else
-            sparseKernel<cn, PATCH_X, PATCH_Y, false><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
-
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    template <bool calcErr>
-    __global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
-    {
-        extern __shared__ int smem[];
-
-        const int patchWidth  = blockDim.x + 2 * c_halfWin_x;
-        const int patchHeight = blockDim.y + 2 * c_halfWin_y;
-
-        int* I_patch = smem;
-        int* dIdx_patch = I_patch + patchWidth * patchHeight;
-        int* dIdy_patch = dIdx_patch + patchWidth * patchHeight;
-
-        const int xBase = blockIdx.x * blockDim.x;
-        const int yBase = blockIdx.y * blockDim.y;
-
-        for (int i = threadIdx.y; i < patchHeight; i += blockDim.y)
-        {
-            for (int j = threadIdx.x; j < patchWidth; j += blockDim.x)
-            {
-                float x = xBase - c_halfWin_x + j + 0.5f;
-                float y = yBase - c_halfWin_y + i + 0.5f;
-
-                I_patch[i * patchWidth + j] = tex2D(tex_Ib, x, y);
-
-                // Sharr Deriv
-
-                dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x+1, y-1) + 10 * tex2D(tex_Ib, x+1, y) + 3 * tex2D(tex_Ib, x+1, y+1) -
-                                                (3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x-1, y) + 3 * tex2D(tex_Ib, x-1, y+1));
-
-                dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x-1, y+1) + 10 * tex2D(tex_Ib, x, y+1) + 3 * tex2D(tex_Ib, x+1, y+1) -
-                                                (3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x, y-1) + 3 * tex2D(tex_Ib, x+1, y-1));
-            }
-        }
-
-        __syncthreads();
-
-        const int x = xBase + threadIdx.x;
-        const int y = yBase + threadIdx.y;
-
-        if (x >= cols || y >= rows)
-            return;
-
-        int A11i = 0;
-        int A12i = 0;
-        int A22i = 0;
-
-        for (int i = 0; i < c_winSize_y; ++i)
-        {
-            for (int j = 0; j < c_winSize_x; ++j)
-            {
-                int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
-                int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
-
-                A11i += dIdx * dIdx;
-                A12i += dIdx * dIdy;
-                A22i += dIdy * dIdy;
-            }
-        }
-
-        float A11 = A11i;
-        float A12 = A12i;
-        float A22 = A22i;
-
-        float D = A11 * A22 - A12 * A12;
-
-        if (D < numeric_limits<float>::epsilon())
-        {
-            if (calcErr)
-                err(y, x) = numeric_limits<float>::max();
-
-            return;
-        }
-
-        D = 1.f / D;
-
-        A11 *= D;
-        A12 *= D;
-        A22 *= D;
-
-        float2 nextPt;
-        nextPt.x = x + prevU(y/2, x/2) * 2.0f;
-        nextPt.y = y + prevV(y/2, x/2) * 2.0f;
-
-        for (int k = 0; k < c_iters; ++k)
-        {
-            if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
-            {
-                if (calcErr)
-                    err(y, x) = numeric_limits<float>::max();
-
-                return;
-            }
-
-            int b1 = 0;
-            int b2 = 0;
-
-            for (int i = 0; i < c_winSize_y; ++i)
-            {
-                for (int j = 0; j < c_winSize_x; ++j)
-                {
-                    int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
-                    int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
-
-                    int diff = (J - I) * 32;
-
-                    int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
-                    int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
-
-                    b1 += diff * dIdx;
-                    b2 += diff * dIdy;
-                }
-            }
-
-            float2 delta;
-            delta.x = A12 * b2 - A22 * b1;
-            delta.y = A12 * b1 - A11 * b2;
-
-            nextPt.x += delta.x;
-            nextPt.y += delta.y;
-
-            if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
-                break;
-        }
-
-        u(y, x) = nextPt.x - x;
-        v(y, x) = nextPt.y - y;
-
-        if (calcErr)
-        {
-            int errval = 0;
-
-            for (int i = 0; i < c_winSize_y; ++i)
-            {
-                for (int j = 0; j < c_winSize_x; ++j)
-                {
-                    int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
-                    int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
-
-                    errval += ::abs(J - I);
-                }
-            }
-
-            err(y, x) = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);
-        }
-    }
-
-    void loadConstants(int2 winSize, int iters)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
-
-        int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
-
-        cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
-    }
-
-    void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                 int level, dim3 block, dim3 patch, cudaStream_t stream)
-    {
-        typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                               int level, dim3 block, cudaStream_t stream);
-
-        static const func_t funcs[5][5] =
-        {
-            {sparse_caller<1, 1, 1>, sparse_caller<1, 2, 1>, sparse_caller<1, 3, 1>, sparse_caller<1, 4, 1>, sparse_caller<1, 5, 1>},
-            {sparse_caller<1, 1, 2>, sparse_caller<1, 2, 2>, sparse_caller<1, 3, 2>, sparse_caller<1, 4, 2>, sparse_caller<1, 5, 2>},
-            {sparse_caller<1, 1, 3>, sparse_caller<1, 2, 3>, sparse_caller<1, 3, 3>, sparse_caller<1, 4, 3>, sparse_caller<1, 5, 3>},
-            {sparse_caller<1, 1, 4>, sparse_caller<1, 2, 4>, sparse_caller<1, 3, 4>, sparse_caller<1, 4, 4>, sparse_caller<1, 5, 4>},
-            {sparse_caller<1, 1, 5>, sparse_caller<1, 2, 5>, sparse_caller<1, 3, 5>, sparse_caller<1, 4, 5>, sparse_caller<1, 5, 5>}
-        };
-
-        bindTexture(&tex_If, I);
-        bindTexture(&tex_Jf, J);
-
-        funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
-            level, block, stream);
-    }
-
-    void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                 int level, dim3 block, dim3 patch, cudaStream_t stream)
-    {
-        typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                               int level, dim3 block, cudaStream_t stream);
-
-        static const func_t funcs[5][5] =
-        {
-            {sparse_caller<4, 1, 1>, sparse_caller<4, 2, 1>, sparse_caller<4, 3, 1>, sparse_caller<4, 4, 1>, sparse_caller<4, 5, 1>},
-            {sparse_caller<4, 1, 2>, sparse_caller<4, 2, 2>, sparse_caller<4, 3, 2>, sparse_caller<4, 4, 2>, sparse_caller<4, 5, 2>},
-            {sparse_caller<4, 1, 3>, sparse_caller<4, 2, 3>, sparse_caller<4, 3, 3>, sparse_caller<4, 4, 3>, sparse_caller<4, 5, 3>},
-            {sparse_caller<4, 1, 4>, sparse_caller<4, 2, 4>, sparse_caller<4, 3, 4>, sparse_caller<4, 4, 4>, sparse_caller<4, 5, 4>},
-            {sparse_caller<4, 1, 5>, sparse_caller<4, 2, 5>, sparse_caller<4, 3, 5>, sparse_caller<4, 4, 5>, sparse_caller<4, 5, 5>}
-        };
-
-        bindTexture(&tex_If4, I);
-        bindTexture(&tex_Jf4, J);
-
-        funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
-            level, block, stream);
-    }
-
-    void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream)
-    {
-        dim3 block(16, 16);
-        dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
-
-        bindTexture(&tex_Ib, I);
-        bindTexture(&tex_Jf, J);
-
-        int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
-        const int patchWidth  = block.x + 2 * halfWin.x;
-        const int patchHeight = block.y + 2 * halfWin.y;
-        size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int);
-
-        if (err.data)
-        {
-            denseKernel<true><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, err, I.rows, I.cols);
-            cudaSafeCall( cudaGetLastError() );
-        }
-        else
-        {
-            denseKernel<false><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
-            cudaSafeCall( cudaGetLastError() );
-        }
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/remap.cu
+++ b/modules/gpu/src/cuda/remap.cu
@@ -1,274 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/filters.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, PtrStepSz<T> dst)
-        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < dst.cols && y < dst.rows)
-            {
-                const float xcoo = mapx.ptr(y)[x];
-                const float ycoo = mapy.ptr(y)[x];
-
-                dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
-            }
-        }
-
-        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
-        {
-            static void call(PtrStepSz<T> src, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
-            {
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
-
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
-                BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
-
-                remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
-                cudaSafeCall( cudaGetLastError() );
-            }
-        };
-
-        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
-        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, PtrStepSz<T> dst, const float* borderValue, bool)
-            {
-                (void)srcWhole;
-                (void)xoff;
-                (void)yoff;
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
-
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
-                BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
-
-                remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
-                cudaSafeCall( cudaGetLastError() );
-
-                cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        };
-
-        #define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
-            texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-            struct tex_remap_ ## type ## _reader \
-            { \
-                typedef type elem_type; \
-                typedef int index_type; \
-                int xoff, yoff; \
-                tex_remap_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
-                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-                { \
-                    return tex2D(tex_remap_ ## type , x + xoff, y + yoff); \
-                } \
-            }; \
-            template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
-            { \
-                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
-                    PtrStepSz< type > dst, const float* borderValue, bool cc20) \
-                { \
-                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
-                    dim3 block(32, cc20 ? 8 : 4); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_remap_ ## type , srcWhole); \
-                    tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
-                    B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
-                    BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
-                    Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
-                    remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            }; \
-            template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
-            { \
-                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
-                    PtrStepSz< type > dst, const float*, bool) \
-                { \
-                    dim3 block(32, 8); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_remap_ ## type , srcWhole); \
-                    tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
-                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
-                    { \
-                        Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
-                        remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
-                    } \
-                    else \
-                    { \
-                        BrdReplicate<type> brd(src.rows, src.cols); \
-                        BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
-                        Filter< BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
-                        remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
-                    } \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            };
-
-        OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
-        OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)
-
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)
-
-        OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
-        OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)
-
-        OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
-        OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)
-
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)
-
-        OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
-        //OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
-        OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)
-
-        #undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
-
-        template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
-        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
-                PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
-            {
-                if (stream == 0)
-                    RemapDispatcherNonStream<Filter, B, T>::call(src, srcWhole, xoff, yoff, mapx, mapy, dst, borderValue, cc20);
-                else
-                    RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc20);
-            }
-        };
-
-        template <typename T> void remap_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
-            PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
-        {
-            typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap,
-                PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
-
-            static const caller_t callers[3][5] =
-            {
-                {
-                    RemapDispatcher<PointFilter, BrdReflect101, T>::call,
-                    RemapDispatcher<PointFilter, BrdReplicate, T>::call,
-                    RemapDispatcher<PointFilter, BrdConstant, T>::call,
-                    RemapDispatcher<PointFilter, BrdReflect, T>::call,
-                    RemapDispatcher<PointFilter, BrdWrap, T>::call
-                },
-                {
-                    RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
-                    RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
-                    RemapDispatcher<LinearFilter, BrdConstant, T>::call,
-                    RemapDispatcher<LinearFilter, BrdReflect, T>::call,
-                    RemapDispatcher<LinearFilter, BrdWrap, T>::call
-                },
-                {
-                    RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
-                    RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
-                    RemapDispatcher<CubicFilter, BrdConstant, T>::call,
-                    RemapDispatcher<CubicFilter, BrdReflect, T>::call,
-                    RemapDispatcher<CubicFilter, BrdWrap, T>::call
-                }
-            };
-
-            callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
-                static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
-        }
-
-        template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        //template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        //template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-    } // namespace imgproc
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/resize.cu
+++ b/modules/gpu/src/cuda/resize.cu
@@ -1,302 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include <cfloat>
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/filters.hpp"
-#include "opencv2/core/cuda/scan.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
-        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < dst.cols && y < dst.rows)
-            {
-                const float xcoo = x * fx;
-                const float ycoo = y * fy;
-
-                dst(y, x) = saturate_cast<T>(src(ycoo, xcoo));
-            }
-        }
-
-        template <typename Ptr2D, typename T> __global__ void resize_area(const Ptr2D src, float fx, float fy, PtrStepSz<T> dst)
-        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < dst.cols && y < dst.rows)
-            {
-                dst(y, x) = saturate_cast<T>(src(y, x));
-            }
-        }
-
-        template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
-        {
-            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
-            {
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                BrdReplicate<T> brd(src.rows, src.cols);
-                BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc, fx, fy);
-
-                resize<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
-                cudaSafeCall( cudaGetLastError() );
-            }
-        };
-
-        template <typename T> struct ResizeDispatcherStream<AreaFilter, T>
-        {
-            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
-            {
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                BrdConstant<T> brd(src.rows, src.cols);
-                BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
-                AreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
-                resize_area<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
-                cudaSafeCall( cudaGetLastError() );
-                if (stream == 0)
-                    cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        };
-
-        template <typename T> struct ResizeDispatcherStream<IntegerAreaFilter, T>
-        {
-            static void call(PtrStepSz<T> src, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
-            {
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-                BrdConstant<T> brd(src.rows, src.cols);
-                BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
-                IntegerAreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
-                resize_area<<<grid, block, 0, stream>>>(filteredSrc, fx, fy, dst);
-                cudaSafeCall( cudaGetLastError() );
-                if (stream == 0)
-                    cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        };
-
-        template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
-        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst)
-            {
-                (void)srcWhole;
-                (void)xoff;
-                (void)yoff;
-
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                BrdReplicate<T> brd(src.rows, src.cols);
-                BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
-
-                resize<<<grid, block>>>(filteredSrc, fx, fy, dst);
-                cudaSafeCall( cudaGetLastError() );
-
-                cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        };
-
-        #define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
-            texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-            struct tex_resize_ ## type ## _reader \
-            { \
-                typedef type elem_type; \
-                typedef int index_type; \
-                const int xoff; \
-                const int yoff; \
-                __host__ tex_resize_ ## type ## _reader(int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
-                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-                { \
-                    return tex2D(tex_resize_ ## type, x + xoff, y + yoff); \
-                } \
-            }; \
-            template <template <typename> class Filter> struct ResizeDispatcherNonStream<Filter, type > \
-            { \
-                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz< type > dst) \
-                { \
-                    dim3 block(32, 8); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_resize_ ## type, srcWhole); \
-                    tex_resize_ ## type ## _reader texSrc(xoff, yoff); \
-                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
-                    { \
-                        Filter<tex_resize_ ## type ## _reader> filteredSrc(texSrc); \
-                        resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
-                    } \
-                    else \
-                    { \
-                        BrdReplicate< type > brd(src.rows, src.cols); \
-                        BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > brdSrc(texSrc, brd); \
-                        Filter< BorderReader<tex_resize_ ## type ## _reader, BrdReplicate< type > > > filteredSrc(brdSrc); \
-                        resize<<<grid, block>>>(filteredSrc, fx, fy, dst); \
-                    } \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            };
-
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)
-
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)
-
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)
-
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)
-
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
-        //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)
-
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
-        OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)
-
-        #undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX
-
-        template <template <typename> class Filter, typename T> struct ResizeDispatcher
-        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
-            {
-                if (stream == 0)
-                    ResizeDispatcherNonStream<Filter, T>::call(src, srcWhole, xoff, yoff, fx, fy, dst);
-                else
-                    ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
-            }
-        };
-
-        template <typename T> struct ResizeDispatcher<AreaFilter, T>
-        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream)
-            {
-                (void)srcWhole;
-                (void)xoff;
-                (void)yoff;
-                int iscale_x = (int)round(fx);
-                int iscale_y = (int)round(fy);
-
-                if( std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN)
-                    ResizeDispatcherStream<IntegerAreaFilter, T>::call(src, fx, fy, dst, stream);
-                else
-                    ResizeDispatcherStream<AreaFilter, T>::call(src, fx, fy, dst, stream);
-            }
-        };
-
-        template <typename T> void resize_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy,
-            PtrStepSzb dst, int interpolation, cudaStream_t stream)
-        {
-            typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSz<T> dst, cudaStream_t stream);
-
-            static const caller_t callers[4] =
-            {
-                ResizeDispatcher<PointFilter, T>::call,
-                ResizeDispatcher<LinearFilter, T>::call,
-                ResizeDispatcher<CubicFilter, T>::call,
-                ResizeDispatcher<AreaFilter, T>::call
-            };
-            // chenge to linear if area interpolation upscaling
-            if (interpolation == 3 && (fx <= 1.f || fy <= 1.f))
-                interpolation = 1;
-
-            callers[interpolation](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, fx, fy,
-                static_cast< PtrStepSz<T> >(dst), stream);
-        }
-
-        template void resize_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-
-        //template void resize_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-
-        template void resize_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-
-        template void resize_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-
-        //template void resize_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-
-        template void resize_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        //template void resize_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-        template void resize_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float fx, float fy, PtrStepSzb dst, int interpolation, cudaStream_t stream);
-
-        template<typename T> struct scan_traits{};
-
-        template<> struct scan_traits<uchar>
-        {
-            typedef float scan_line_type;
-        };
-
-    } // namespace imgproc
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/rgb_to_yv12.cu
+++ b/modules/gpu/src/cuda/rgb_to_yv12.cu
@@ -1,175 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace video_encoding
-    {
-        __device__ __forceinline__ void rgbtoy(const uchar b, const uchar g, const uchar r, uchar& y)
-        {
-            y = static_cast<uchar>(((int)(30 * r) + (int)(59 * g) + (int)(11 * b)) / 100);
-        }
-
-        __device__ __forceinline__ void rgbtoyuv(const uchar b, const uchar g, const uchar r, uchar& y, uchar& u, uchar& v)
-        {
-            rgbtoy(b, g, r, y);
-            u = static_cast<uchar>(((int)(-17 * r) - (int)(33 * g) + (int)(50 * b) + 12800) / 100);
-            v = static_cast<uchar>(((int)(50 * r) - (int)(42 * g) - (int)(8 * b) + 12800) / 100);
-        }
-
-        __global__ void Gray_to_YV12(const PtrStepSzb src, PtrStepb dst)
-        {
-            const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
-            const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
-
-            if (x + 1 >= src.cols || y + 1 >= src.rows)
-                return;
-
-            // get pointers to the data
-            const size_t planeSize = src.rows * dst.step;
-            PtrStepb y_plane(dst.data, dst.step);
-            PtrStepb u_plane(y_plane.data + planeSize, dst.step / 2);
-            PtrStepb v_plane(u_plane.data + (planeSize / 4), dst.step / 2);
-
-            uchar pix;
-            uchar y_val, u_val, v_val;
-
-            pix = src(y, x);
-            rgbtoy(pix, pix, pix, y_val);
-            y_plane(y, x) = y_val;
-
-            pix = src(y, x + 1);
-            rgbtoy(pix, pix, pix, y_val);
-            y_plane(y, x + 1) = y_val;
-
-            pix = src(y + 1, x);
-            rgbtoy(pix, pix, pix, y_val);
-            y_plane(y + 1, x) = y_val;
-
-            pix = src(y + 1, x + 1);
-            rgbtoyuv(pix, pix, pix, y_val, u_val, v_val);
-            y_plane(y + 1, x + 1) = y_val;
-            u_plane(y / 2, x / 2) = u_val;
-            v_plane(y / 2, x / 2) = v_val;
-        }
-
-        template <typename T>
-        __global__ void BGR_to_YV12(const PtrStepSz<T> src, PtrStepb dst)
-        {
-            const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
-            const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
-
-            if (x + 1 >= src.cols || y + 1 >= src.rows)
-                return;
-
-            // get pointers to the data
-            const size_t planeSize = src.rows * dst.step;
-            PtrStepb y_plane(dst.data, dst.step);
-            PtrStepb u_plane(y_plane.data + planeSize, dst.step / 2);
-            PtrStepb v_plane(u_plane.data + (planeSize / 4), dst.step / 2);
-
-            T pix;
-            uchar y_val, u_val, v_val;
-
-            pix = src(y, x);
-            rgbtoy(pix.z, pix.y, pix.x, y_val);
-            y_plane(y, x) = y_val;
-
-            pix = src(y, x + 1);
-            rgbtoy(pix.z, pix.y, pix.x, y_val);
-            y_plane(y, x + 1) = y_val;
-
-            pix = src(y + 1, x);
-            rgbtoy(pix.z, pix.y, pix.x, y_val);
-            y_plane(y + 1, x) = y_val;
-
-            pix = src(y + 1, x + 1);
-            rgbtoyuv(pix.z, pix.y, pix.x, y_val, u_val, v_val);
-            y_plane(y + 1, x + 1) = y_val;
-            u_plane(y / 2, x / 2) = u_val;
-            v_plane(y / 2, x / 2) = v_val;
-        }
-
-        void Gray_to_YV12_caller(const PtrStepSzb src, PtrStepb dst)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2));
-
-            Gray_to_YV12<<<grid, block>>>(src, dst);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-        template <int cn>
-        void BGR_to_YV12_caller(const PtrStepSzb src, PtrStepb dst)
-        {
-            typedef typename TypeVec<uchar, cn>::vec_type src_t;
-
-            dim3 block(32, 8);
-            dim3 grid(divUp(src.cols, block.x * 2), divUp(src.rows, block.y * 2));
-
-            BGR_to_YV12<<<grid, block>>>(static_cast< PtrStepSz<src_t> >(src), dst);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void YV12_gpu(const PtrStepSzb src, int cn, PtrStepSzb dst)
-        {
-            typedef void (*func_t)(const PtrStepSzb src, PtrStepb dst);
-
-            static const func_t funcs[] =
-            {
-                0, Gray_to_YV12_caller, 0, BGR_to_YV12_caller<3>, BGR_to_YV12_caller<4>
-            };
-
-            funcs[cn](src, dst);
-        }
-    }
-}}}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.0.cu
+++ b/modules/gpu/src/cuda/row_filter.0.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "row_filter.h"
-
-namespace filter
-{
-    template void linearRow<uchar, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.1.cu
+++ b/modules/gpu/src/cuda/row_filter.1.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "row_filter.h"
-
-namespace filter
-{
-    template void linearRow<uchar3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.10.cu
+++ b/modules/gpu/src/cuda/row_filter.10.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "row_filter.h"
-
-namespace filter
-{
-    template void linearRow<unsigned short, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.11.cu
+++ b/modules/gpu/src/cuda/row_filter.11.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "row_filter.h"
-
-namespace filter
-{
-    template void linearRow<ushort3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.12.cu
+++ b/modules/gpu/src/cuda/row_filter.12.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "row_filter.h"
-
-namespace filter
-{
-    template void linearRow<ushort4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.13.cu
+++ b/modules/gpu/src/cuda/row_filter.13.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "row_filter.h"
-
-namespace filter
-{
-    template void linearRow<int3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.14.cu
+++ b/modules/gpu/src/cuda/row_filter.14.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "row_filter.h"
-
-namespace filter
-{
-    template void linearRow<int4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.2.cu
+++ b/modules/gpu/src/cuda/row_filter.2.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "row_filter.h"
-
-namespace filter
-{
-    template void linearRow<uchar4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.3.cu
+++ b/modules/gpu/src/cuda/row_filter.3.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "row_filter.h"
-
-namespace filter
-{
-    template void linearRow<short3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.4.cu
+++ b/modules/gpu/src/cuda/row_filter.4.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "row_filter.h"
-
-namespace filter
-{
-    template void linearRow<int, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.5.cu
+++ b/modules/gpu/src/cuda/row_filter.5.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "row_filter.h"
-
-namespace filter
-{
-    template void linearRow<float, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.6.cu
+++ b/modules/gpu/src/cuda/row_filter.6.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "row_filter.h"
-
-namespace filter
-{
-    template void linearRow<float3, float3>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.7.cu
+++ b/modules/gpu/src/cuda/row_filter.7.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "row_filter.h"
-
-namespace filter
-{
-    template void linearRow<float4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.8.cu
+++ b/modules/gpu/src/cuda/row_filter.8.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "row_filter.h"
-
-namespace filter
-{
-    template void linearRow<short, float>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.9.cu
+++ b/modules/gpu/src/cuda/row_filter.9.cu
@@ -1,52 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "row_filter.h"
-
-namespace filter
-{
-    template void linearRow<short4, float4>(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
-}
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/row_filter.h
+++ b/modules/gpu/src/cuda/row_filter.h
@@ -1,371 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::cudev;
-
-namespace row_filter
-{
-    #define MAX_KERNEL_SIZE 32
-
-    __constant__ float c_kernel[MAX_KERNEL_SIZE];
-
-    template <int KSIZE, typename T, typename D, typename B>
-    __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
-    {
-        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
-            const int BLOCK_DIM_X = 32;
-            const int BLOCK_DIM_Y = 8;
-            const int PATCH_PER_BLOCK = 4;
-            const int HALO_SIZE = 1;
-        #else
-            const int BLOCK_DIM_X = 32;
-            const int BLOCK_DIM_Y = 4;
-            const int PATCH_PER_BLOCK = 4;
-            const int HALO_SIZE = 1;
-        #endif
-
-        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
-
-        __shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X];
-
-        const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
-
-        if (y >= src.rows)
-            return;
-
-        const T* src_row = src.ptr(y);
-
-        const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x;
-
-        if (blockIdx.x > 0)
-        {
-            //Load left halo
-            #pragma unroll
-            for (int j = 0; j < HALO_SIZE; ++j)
-                smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]);
-        }
-        else
-        {
-            //Load left halo
-            #pragma unroll
-            for (int j = 0; j < HALO_SIZE; ++j)
-                smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row));
-        }
-
-        if (blockIdx.x + 2 < gridDim.x)
-        {
-            //Load main data
-            #pragma unroll
-            for (int j = 0; j < PATCH_PER_BLOCK; ++j)
-                smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]);
-
-            //Load right halo
-            #pragma unroll
-            for (int j = 0; j < HALO_SIZE; ++j)
-                smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]);
-        }
-        else
-        {
-            //Load main data
-            #pragma unroll
-            for (int j = 0; j < PATCH_PER_BLOCK; ++j)
-                smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row));
-
-            //Load right halo
-            #pragma unroll
-            for (int j = 0; j < HALO_SIZE; ++j)
-                smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row));
-        }
-
-        __syncthreads();
-
-        #pragma unroll
-        for (int j = 0; j < PATCH_PER_BLOCK; ++j)
-        {
-            const int x = xStart + j * BLOCK_DIM_X;
-
-            if (x < src.cols)
-            {
-                sum_t sum = VecTraits<sum_t>::all(0);
-
-                #pragma unroll
-                for (int k = 0; k < KSIZE; ++k)
-                    sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
-
-                dst(y, x) = saturate_cast<D>(sum);
-            }
-        }
-    }
-
-    template <int KSIZE, typename T, typename D, template<typename> class B>
-    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
-    {
-        int BLOCK_DIM_X;
-        int BLOCK_DIM_Y;
-        int PATCH_PER_BLOCK;
-
-        if (cc >= 20)
-        {
-            BLOCK_DIM_X = 32;
-            BLOCK_DIM_Y = 8;
-            PATCH_PER_BLOCK = 4;
-        }
-        else
-        {
-            BLOCK_DIM_X = 32;
-            BLOCK_DIM_Y = 4;
-            PATCH_PER_BLOCK = 4;
-        }
-
-        const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
-        const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y));
-
-        B<T> brd(src.cols);
-
-        linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-namespace filter
-{
-    template <typename T, typename D>
-    void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
-    {
-        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
-
-        static const caller_t callers[5][33] =
-        {
-            {
-                0,
-                row_filter::caller< 1, T, D, BrdRowReflect101>,
-                row_filter::caller< 2, T, D, BrdRowReflect101>,
-                row_filter::caller< 3, T, D, BrdRowReflect101>,
-                row_filter::caller< 4, T, D, BrdRowReflect101>,
-                row_filter::caller< 5, T, D, BrdRowReflect101>,
-                row_filter::caller< 6, T, D, BrdRowReflect101>,
-                row_filter::caller< 7, T, D, BrdRowReflect101>,
-                row_filter::caller< 8, T, D, BrdRowReflect101>,
-                row_filter::caller< 9, T, D, BrdRowReflect101>,
-                row_filter::caller<10, T, D, BrdRowReflect101>,
-                row_filter::caller<11, T, D, BrdRowReflect101>,
-                row_filter::caller<12, T, D, BrdRowReflect101>,
-                row_filter::caller<13, T, D, BrdRowReflect101>,
-                row_filter::caller<14, T, D, BrdRowReflect101>,
-                row_filter::caller<15, T, D, BrdRowReflect101>,
-                row_filter::caller<16, T, D, BrdRowReflect101>,
-                row_filter::caller<17, T, D, BrdRowReflect101>,
-                row_filter::caller<18, T, D, BrdRowReflect101>,
-                row_filter::caller<19, T, D, BrdRowReflect101>,
-                row_filter::caller<20, T, D, BrdRowReflect101>,
-                row_filter::caller<21, T, D, BrdRowReflect101>,
-                row_filter::caller<22, T, D, BrdRowReflect101>,
-                row_filter::caller<23, T, D, BrdRowReflect101>,
-                row_filter::caller<24, T, D, BrdRowReflect101>,
-                row_filter::caller<25, T, D, BrdRowReflect101>,
-                row_filter::caller<26, T, D, BrdRowReflect101>,
-                row_filter::caller<27, T, D, BrdRowReflect101>,
-                row_filter::caller<28, T, D, BrdRowReflect101>,
-                row_filter::caller<29, T, D, BrdRowReflect101>,
-                row_filter::caller<30, T, D, BrdRowReflect101>,
-                row_filter::caller<31, T, D, BrdRowReflect101>,
-                row_filter::caller<32, T, D, BrdRowReflect101>
-            },
-            {
-                0,
-                row_filter::caller< 1, T, D, BrdRowReplicate>,
-                row_filter::caller< 2, T, D, BrdRowReplicate>,
-                row_filter::caller< 3, T, D, BrdRowReplicate>,
-                row_filter::caller< 4, T, D, BrdRowReplicate>,
-                row_filter::caller< 5, T, D, BrdRowReplicate>,
-                row_filter::caller< 6, T, D, BrdRowReplicate>,
-                row_filter::caller< 7, T, D, BrdRowReplicate>,
-                row_filter::caller< 8, T, D, BrdRowReplicate>,
-                row_filter::caller< 9, T, D, BrdRowReplicate>,
-                row_filter::caller<10, T, D, BrdRowReplicate>,
-                row_filter::caller<11, T, D, BrdRowReplicate>,
-                row_filter::caller<12, T, D, BrdRowReplicate>,
-                row_filter::caller<13, T, D, BrdRowReplicate>,
-                row_filter::caller<14, T, D, BrdRowReplicate>,
-                row_filter::caller<15, T, D, BrdRowReplicate>,
-                row_filter::caller<16, T, D, BrdRowReplicate>,
-                row_filter::caller<17, T, D, BrdRowReplicate>,
-                row_filter::caller<18, T, D, BrdRowReplicate>,
-                row_filter::caller<19, T, D, BrdRowReplicate>,
-                row_filter::caller<20, T, D, BrdRowReplicate>,
-                row_filter::caller<21, T, D, BrdRowReplicate>,
-                row_filter::caller<22, T, D, BrdRowReplicate>,
-                row_filter::caller<23, T, D, BrdRowReplicate>,
-                row_filter::caller<24, T, D, BrdRowReplicate>,
-                row_filter::caller<25, T, D, BrdRowReplicate>,
-                row_filter::caller<26, T, D, BrdRowReplicate>,
-                row_filter::caller<27, T, D, BrdRowReplicate>,
-                row_filter::caller<28, T, D, BrdRowReplicate>,
-                row_filter::caller<29, T, D, BrdRowReplicate>,
-                row_filter::caller<30, T, D, BrdRowReplicate>,
-                row_filter::caller<31, T, D, BrdRowReplicate>,
-                row_filter::caller<32, T, D, BrdRowReplicate>
-            },
-            {
-                0,
-                row_filter::caller< 1, T, D, BrdRowConstant>,
-                row_filter::caller< 2, T, D, BrdRowConstant>,
-                row_filter::caller< 3, T, D, BrdRowConstant>,
-                row_filter::caller< 4, T, D, BrdRowConstant>,
-                row_filter::caller< 5, T, D, BrdRowConstant>,
-                row_filter::caller< 6, T, D, BrdRowConstant>,
-                row_filter::caller< 7, T, D, BrdRowConstant>,
-                row_filter::caller< 8, T, D, BrdRowConstant>,
-                row_filter::caller< 9, T, D, BrdRowConstant>,
-                row_filter::caller<10, T, D, BrdRowConstant>,
-                row_filter::caller<11, T, D, BrdRowConstant>,
-                row_filter::caller<12, T, D, BrdRowConstant>,
-                row_filter::caller<13, T, D, BrdRowConstant>,
-                row_filter::caller<14, T, D, BrdRowConstant>,
-                row_filter::caller<15, T, D, BrdRowConstant>,
-                row_filter::caller<16, T, D, BrdRowConstant>,
-                row_filter::caller<17, T, D, BrdRowConstant>,
-                row_filter::caller<18, T, D, BrdRowConstant>,
-                row_filter::caller<19, T, D, BrdRowConstant>,
-                row_filter::caller<20, T, D, BrdRowConstant>,
-                row_filter::caller<21, T, D, BrdRowConstant>,
-                row_filter::caller<22, T, D, BrdRowConstant>,
-                row_filter::caller<23, T, D, BrdRowConstant>,
-                row_filter::caller<24, T, D, BrdRowConstant>,
-                row_filter::caller<25, T, D, BrdRowConstant>,
-                row_filter::caller<26, T, D, BrdRowConstant>,
-                row_filter::caller<27, T, D, BrdRowConstant>,
-                row_filter::caller<28, T, D, BrdRowConstant>,
-                row_filter::caller<29, T, D, BrdRowConstant>,
-                row_filter::caller<30, T, D, BrdRowConstant>,
-                row_filter::caller<31, T, D, BrdRowConstant>,
-                row_filter::caller<32, T, D, BrdRowConstant>
-            },
-            {
-                0,
-                row_filter::caller< 1, T, D, BrdRowReflect>,
-                row_filter::caller< 2, T, D, BrdRowReflect>,
-                row_filter::caller< 3, T, D, BrdRowReflect>,
-                row_filter::caller< 4, T, D, BrdRowReflect>,
-                row_filter::caller< 5, T, D, BrdRowReflect>,
-                row_filter::caller< 6, T, D, BrdRowReflect>,
-                row_filter::caller< 7, T, D, BrdRowReflect>,
-                row_filter::caller< 8, T, D, BrdRowReflect>,
-                row_filter::caller< 9, T, D, BrdRowReflect>,
-                row_filter::caller<10, T, D, BrdRowReflect>,
-                row_filter::caller<11, T, D, BrdRowReflect>,
-                row_filter::caller<12, T, D, BrdRowReflect>,
-                row_filter::caller<13, T, D, BrdRowReflect>,
-                row_filter::caller<14, T, D, BrdRowReflect>,
-                row_filter::caller<15, T, D, BrdRowReflect>,
-                row_filter::caller<16, T, D, BrdRowReflect>,
-                row_filter::caller<17, T, D, BrdRowReflect>,
-                row_filter::caller<18, T, D, BrdRowReflect>,
-                row_filter::caller<19, T, D, BrdRowReflect>,
-                row_filter::caller<20, T, D, BrdRowReflect>,
-                row_filter::caller<21, T, D, BrdRowReflect>,
-                row_filter::caller<22, T, D, BrdRowReflect>,
-                row_filter::caller<23, T, D, BrdRowReflect>,
-                row_filter::caller<24, T, D, BrdRowReflect>,
-                row_filter::caller<25, T, D, BrdRowReflect>,
-                row_filter::caller<26, T, D, BrdRowReflect>,
-                row_filter::caller<27, T, D, BrdRowReflect>,
-                row_filter::caller<28, T, D, BrdRowReflect>,
-                row_filter::caller<29, T, D, BrdRowReflect>,
-                row_filter::caller<30, T, D, BrdRowReflect>,
-                row_filter::caller<31, T, D, BrdRowReflect>,
-                row_filter::caller<32, T, D, BrdRowReflect>
-            },
-            {
-                0,
-                row_filter::caller< 1, T, D, BrdRowWrap>,
-                row_filter::caller< 2, T, D, BrdRowWrap>,
-                row_filter::caller< 3, T, D, BrdRowWrap>,
-                row_filter::caller< 4, T, D, BrdRowWrap>,
-                row_filter::caller< 5, T, D, BrdRowWrap>,
-                row_filter::caller< 6, T, D, BrdRowWrap>,
-                row_filter::caller< 7, T, D, BrdRowWrap>,
-                row_filter::caller< 8, T, D, BrdRowWrap>,
-                row_filter::caller< 9, T, D, BrdRowWrap>,
-                row_filter::caller<10, T, D, BrdRowWrap>,
-                row_filter::caller<11, T, D, BrdRowWrap>,
-                row_filter::caller<12, T, D, BrdRowWrap>,
-                row_filter::caller<13, T, D, BrdRowWrap>,
-                row_filter::caller<14, T, D, BrdRowWrap>,
-                row_filter::caller<15, T, D, BrdRowWrap>,
-                row_filter::caller<16, T, D, BrdRowWrap>,
-                row_filter::caller<17, T, D, BrdRowWrap>,
-                row_filter::caller<18, T, D, BrdRowWrap>,
-                row_filter::caller<19, T, D, BrdRowWrap>,
-                row_filter::caller<20, T, D, BrdRowWrap>,
-                row_filter::caller<21, T, D, BrdRowWrap>,
-                row_filter::caller<22, T, D, BrdRowWrap>,
-                row_filter::caller<23, T, D, BrdRowWrap>,
-                row_filter::caller<24, T, D, BrdRowWrap>,
-                row_filter::caller<25, T, D, BrdRowWrap>,
-                row_filter::caller<26, T, D, BrdRowWrap>,
-                row_filter::caller<27, T, D, BrdRowWrap>,
-                row_filter::caller<28, T, D, BrdRowWrap>,
-                row_filter::caller<29, T, D, BrdRowWrap>,
-                row_filter::caller<30, T, D, BrdRowWrap>,
-                row_filter::caller<31, T, D, BrdRowWrap>,
-                row_filter::caller<32, T, D, BrdRowWrap>
-            }
-        };
-
-        if (stream == 0)
-            cudaSafeCall( cudaMemcpyToSymbol(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
-        else
-            cudaSafeCall( cudaMemcpyToSymbolAsync(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-
-        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
-    }
-}
--- a/modules/gpu/src/cuda/safe_call.hpp
+++ b/modules/gpu/src/cuda/safe_call.hpp
@@ -1,86 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_CUDA_SAFE_CALL_HPP__
-#define __OPENCV_CUDA_SAFE_CALL_HPP__
-
-#include <cuda_runtime_api.h>
-#include <cufft.h>
-#include <cublas.h>
-#include "NCV.hpp"
-
-#if defined(__GNUC__)
-    #define ncvSafeCall(expr)  ___ncvSafeCall(expr, __FILE__, __LINE__, __func__)
-    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__, __func__)
-    #define cublasSafeCall(expr)  ___cublasSafeCall(expr, __FILE__, __LINE__, __func__)
-#else /* defined(__CUDACC__) || defined(__MSVC__) */
-    #define ncvSafeCall(expr)  ___ncvSafeCall(expr, __FILE__, __LINE__)
-    #define cufftSafeCall(expr)  ___cufftSafeCall(expr, __FILE__, __LINE__)
-    #define cublasSafeCall(expr)  ___cublasSafeCall(expr, __FILE__, __LINE__)
-#endif
-
-namespace cv { namespace gpu
-{
-    void ncvError(int err, const char *file, const int line, const char *func = "");
-    void cufftError(int err, const char *file, const int line, const char *func = "");
-    void cublasError(int err, const char *file, const int line, const char *func = "");
-}}
-
-static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
-{
-    if (NCV_SUCCESS != err)
-        cv::gpu::ncvError(err, file, line, func);
-}
-
-static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
-{
-    if (CUFFT_SUCCESS != err)
-        cv::gpu::cufftError(err, file, line, func);
-}
-
-static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
-{
-    if (CUBLAS_STATUS_SUCCESS != err)
-        cv::gpu::cublasError(err, file, line, func);
-}
-
-#endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */
--- a/modules/gpu/src/cuda/split_merge.cu
+++ b/modules/gpu/src/cuda/split_merge.cu
@@ -1,511 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace split_merge
-    {
-        template <typename T, size_t elem_size = sizeof(T)>
-        struct TypeTraits
-        {
-            typedef T type;
-            typedef T type2;
-            typedef T type3;
-            typedef T type4;
-        };
-
-        template <typename T>
-        struct TypeTraits<T, 1>
-        {
-            typedef char type;
-            typedef char2 type2;
-            typedef char3 type3;
-            typedef char4 type4;
-        };
-
-        template <typename T>
-        struct TypeTraits<T, 2>
-        {
-            typedef short type;
-            typedef short2 type2;
-            typedef short3 type3;
-            typedef short4 type4;
-        };
-
-        template <typename T>
-        struct TypeTraits<T, 4>
-        {
-            typedef int type;
-            typedef int2 type2;
-            typedef int3 type3;
-            typedef int4 type4;
-        };
-
-        template <typename T>
-        struct TypeTraits<T, 8>
-        {
-            typedef double type;
-            typedef double2 type2;
-            //typedef double3 type3;
-            //typedef double4 type3;
-        };
-
-        typedef void (*MergeFunction)(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream);
-        typedef void (*SplitFunction)(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream);
-
-        //------------------------------------------------------------
-        // Merge
-
-        template <typename T>
-        __global__ void mergeC2_(const uchar* src0, size_t src0_step,
-                                 const uchar* src1, size_t src1_step,
-                                 int rows, int cols, uchar* dst, size_t dst_step)
-        {
-            typedef typename TypeTraits<T>::type2 dst_type;
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            const T* src0_y = (const T*)(src0 + y * src0_step);
-            const T* src1_y = (const T*)(src1 + y * src1_step);
-            dst_type* dst_y = (dst_type*)(dst + y * dst_step);
-
-            if (x < cols && y < rows)
-            {
-                dst_type dst_elem;
-                dst_elem.x = src0_y[x];
-                dst_elem.y = src1_y[x];
-                dst_y[x] = dst_elem;
-            }
-        }
-
-
-        template <typename T>
-        __global__ void mergeC3_(const uchar* src0, size_t src0_step,
-                                 const uchar* src1, size_t src1_step,
-                                 const uchar* src2, size_t src2_step,
-                                 int rows, int cols, uchar* dst, size_t dst_step)
-        {
-            typedef typename TypeTraits<T>::type3 dst_type;
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            const T* src0_y = (const T*)(src0 + y * src0_step);
-            const T* src1_y = (const T*)(src1 + y * src1_step);
-            const T* src2_y = (const T*)(src2 + y * src2_step);
-            dst_type* dst_y = (dst_type*)(dst + y * dst_step);
-
-            if (x < cols && y < rows)
-            {
-                dst_type dst_elem;
-                dst_elem.x = src0_y[x];
-                dst_elem.y = src1_y[x];
-                dst_elem.z = src2_y[x];
-                dst_y[x] = dst_elem;
-            }
-        }
-
-
-        template <>
-        __global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
-                                 const uchar* src1, size_t src1_step,
-                                 const uchar* src2, size_t src2_step,
-                                 int rows, int cols, uchar* dst, size_t dst_step)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            const double* src0_y = (const double*)(src0 + y * src0_step);
-            const double* src1_y = (const double*)(src1 + y * src1_step);
-            const double* src2_y = (const double*)(src2 + y * src2_step);
-            double* dst_y = (double*)(dst + y * dst_step);
-
-            if (x < cols && y < rows)
-            {
-                dst_y[3 * x] = src0_y[x];
-                dst_y[3 * x + 1] = src1_y[x];
-                dst_y[3 * x + 2] = src2_y[x];
-            }
-        }
-
-
-        template <typename T>
-        __global__ void mergeC4_(const uchar* src0, size_t src0_step,
-                                 const uchar* src1, size_t src1_step,
-                                 const uchar* src2, size_t src2_step,
-                                 const uchar* src3, size_t src3_step,
-                                 int rows, int cols, uchar* dst, size_t dst_step)
-        {
-            typedef typename TypeTraits<T>::type4 dst_type;
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            const T* src0_y = (const T*)(src0 + y * src0_step);
-            const T* src1_y = (const T*)(src1 + y * src1_step);
-            const T* src2_y = (const T*)(src2 + y * src2_step);
-            const T* src3_y = (const T*)(src3 + y * src3_step);
-            dst_type* dst_y = (dst_type*)(dst + y * dst_step);
-
-            if (x < cols && y < rows)
-            {
-                dst_type dst_elem;
-                dst_elem.x = src0_y[x];
-                dst_elem.y = src1_y[x];
-                dst_elem.z = src2_y[x];
-                dst_elem.w = src3_y[x];
-                dst_y[x] = dst_elem;
-            }
-        }
-
-
-        template <>
-        __global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
-                                 const uchar* src1, size_t src1_step,
-                                 const uchar* src2, size_t src2_step,
-                                 const uchar* src3, size_t src3_step,
-                                 int rows, int cols, uchar* dst, size_t dst_step)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            const double* src0_y = (const double*)(src0 + y * src0_step);
-            const double* src1_y = (const double*)(src1 + y * src1_step);
-            const double* src2_y = (const double*)(src2 + y * src2_step);
-            const double* src3_y = (const double*)(src3 + y * src3_step);
-            double2* dst_y = (double2*)(dst + y * dst_step);
-
-            if (x < cols && y < rows)
-            {
-                dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
-                dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
-            }
-        }
-
-
-        template <typename T>
-        static void mergeC2_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-            mergeC2_<T><<<grid, block, 0, stream>>>(
-                    src[0].data, src[0].step,
-                    src[1].data, src[1].step,
-                    dst.rows, dst.cols, dst.data, dst.step);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-
-        template <typename T>
-        static void mergeC3_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-            mergeC3_<T><<<grid, block, 0, stream>>>(
-                    src[0].data, src[0].step,
-                    src[1].data, src[1].step,
-                    src[2].data, src[2].step,
-                    dst.rows, dst.cols, dst.data, dst.step);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-
-        template <typename T>
-        static void mergeC4_(const PtrStepSzb* src, PtrStepSzb& dst, const cudaStream_t& stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-            mergeC4_<T><<<grid, block, 0, stream>>>(
-                    src[0].data, src[0].step,
-                    src[1].data, src[1].step,
-                    src[2].data, src[2].step,
-                    src[3].data, src[3].step,
-                    dst.rows, dst.cols, dst.data, dst.step);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-
-        void merge_caller(const PtrStepSzb* src, PtrStepSzb& dst,
-                                     int total_channels, size_t elem_size,
-                                     const cudaStream_t& stream)
-        {
-            static MergeFunction merge_func_tbl[] =
-            {
-                mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
-                mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
-                mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
-            };
-
-            size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
-            MergeFunction merge_func = merge_func_tbl[merge_func_id];
-
-            if (merge_func == 0)
-                CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");
-
-            merge_func(src, dst, stream);
-        }
-
-
-
-        //------------------------------------------------------------
-        // Split
-
-
-        template <typename T>
-        __global__ void splitC2_(const uchar* src, size_t src_step,
-                                int rows, int cols,
-                                uchar* dst0, size_t dst0_step,
-                                uchar* dst1, size_t dst1_step)
-        {
-            typedef typename TypeTraits<T>::type2 src_type;
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            const src_type* src_y = (const src_type*)(src + y * src_step);
-            T* dst0_y = (T*)(dst0 + y * dst0_step);
-            T* dst1_y = (T*)(dst1 + y * dst1_step);
-
-            if (x < cols && y < rows)
-            {
-                src_type src_elem = src_y[x];
-                dst0_y[x] = src_elem.x;
-                dst1_y[x] = src_elem.y;
-            }
-        }
-
-
-        template <typename T>
-        __global__ void splitC3_(const uchar* src, size_t src_step,
-                                int rows, int cols,
-                                uchar* dst0, size_t dst0_step,
-                                uchar* dst1, size_t dst1_step,
-                                uchar* dst2, size_t dst2_step)
-        {
-            typedef typename TypeTraits<T>::type3 src_type;
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            const src_type* src_y = (const src_type*)(src + y * src_step);
-            T* dst0_y = (T*)(dst0 + y * dst0_step);
-            T* dst1_y = (T*)(dst1 + y * dst1_step);
-            T* dst2_y = (T*)(dst2 + y * dst2_step);
-
-            if (x < cols && y < rows)
-            {
-                src_type src_elem = src_y[x];
-                dst0_y[x] = src_elem.x;
-                dst1_y[x] = src_elem.y;
-                dst2_y[x] = src_elem.z;
-            }
-        }
-
-
-        template <>
-        __global__ void splitC3_<double>(
-                const uchar* src, size_t src_step, int rows, int cols,
-                uchar* dst0, size_t dst0_step,
-                uchar* dst1, size_t dst1_step,
-                uchar* dst2, size_t dst2_step)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            const double* src_y = (const double*)(src + y * src_step);
-            double* dst0_y = (double*)(dst0 + y * dst0_step);
-            double* dst1_y = (double*)(dst1 + y * dst1_step);
-            double* dst2_y = (double*)(dst2 + y * dst2_step);
-
-            if (x < cols && y < rows)
-            {
-                dst0_y[x] = src_y[3 * x];
-                dst1_y[x] = src_y[3 * x + 1];
-                dst2_y[x] = src_y[3 * x + 2];
-            }
-        }
-
-
-        template <typename T>
-        __global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
-                                uchar* dst0, size_t dst0_step,
-                                uchar* dst1, size_t dst1_step,
-                                uchar* dst2, size_t dst2_step,
-                                uchar* dst3, size_t dst3_step)
-        {
-            typedef typename TypeTraits<T>::type4 src_type;
-
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            const src_type* src_y = (const src_type*)(src + y * src_step);
-            T* dst0_y = (T*)(dst0 + y * dst0_step);
-            T* dst1_y = (T*)(dst1 + y * dst1_step);
-            T* dst2_y = (T*)(dst2 + y * dst2_step);
-            T* dst3_y = (T*)(dst3 + y * dst3_step);
-
-            if (x < cols && y < rows)
-            {
-                src_type src_elem = src_y[x];
-                dst0_y[x] = src_elem.x;
-                dst1_y[x] = src_elem.y;
-                dst2_y[x] = src_elem.z;
-                dst3_y[x] = src_elem.w;
-            }
-        }
-
-
-        template <>
-        __global__ void splitC4_<double>(
-                const uchar* src, size_t src_step, int rows, int cols,
-                uchar* dst0, size_t dst0_step,
-                uchar* dst1, size_t dst1_step,
-                uchar* dst2, size_t dst2_step,
-                uchar* dst3, size_t dst3_step)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            const double2* src_y = (const double2*)(src + y * src_step);
-            double* dst0_y = (double*)(dst0 + y * dst0_step);
-            double* dst1_y = (double*)(dst1 + y * dst1_step);
-            double* dst2_y = (double*)(dst2 + y * dst2_step);
-            double* dst3_y = (double*)(dst3 + y * dst3_step);
-
-            if (x < cols && y < rows)
-            {
-                double2 src_elem1 = src_y[2 * x];
-                double2 src_elem2 = src_y[2 * x + 1];
-                dst0_y[x] = src_elem1.x;
-                dst1_y[x] = src_elem1.y;
-                dst2_y[x] = src_elem2.x;
-                dst3_y[x] = src_elem2.y;
-            }
-        }
-
-        template <typename T>
-        static void splitC2_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-            splitC2_<T><<<grid, block, 0, stream>>>(
-                    src.data, src.step, src.rows, src.cols,
-                    dst[0].data, dst[0].step,
-                    dst[1].data, dst[1].step);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-
-        template <typename T>
-        static void splitC3_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-            splitC3_<T><<<grid, block, 0, stream>>>(
-                    src.data, src.step, src.rows, src.cols,
-                    dst[0].data, dst[0].step,
-                    dst[1].data, dst[1].step,
-                    dst[2].data, dst[2].step);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-
-        template <typename T>
-        static void splitC4_(const PtrStepSzb& src, PtrStepSzb* dst, const cudaStream_t& stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-            splitC4_<T><<<grid, block, 0, stream>>>(
-                     src.data, src.step, src.rows, src.cols,
-                     dst[0].data, dst[0].step,
-                     dst[1].data, dst[1].step,
-                     dst[2].data, dst[2].step,
-                     dst[3].data, dst[3].step);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall(cudaDeviceSynchronize());
-        }
-
-
-        void split_caller(const PtrStepSzb& src, PtrStepSzb* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
-        {
-            static SplitFunction split_func_tbl[] =
-            {
-                splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
-                splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
-                splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
-            };
-
-            size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
-            SplitFunction split_func = split_func_tbl[split_func_id];
-
-            if (split_func == 0)
-                CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported channel count or data type");
-
-            split_func(src, dst, stream);
-        }
-    } // namespace split_merge
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/stereobm.cu
+++ b/modules/gpu/src/cuda/stereobm.cu
@@ -1,540 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace stereobm
-    {
-        //////////////////////////////////////////////////////////////////////////////////////////////////
-        /////////////////////////////////////// Stereo BM ////////////////////////////////////////////////
-        //////////////////////////////////////////////////////////////////////////////////////////////////
-
-        #define ROWSperTHREAD 21     // the number of rows a thread will process
-
-        #define BLOCK_W 128          // the thread block width (464)
-        #define N_DISPARITIES 8
-
-        #define STEREO_MIND 0                    // The minimum d range to check
-        #define STEREO_DISP_STEP N_DISPARITIES   // the d step, must be <= 1 to avoid aliasing
-
-        __constant__ unsigned int* cminSSDImage;
-        __constant__ size_t cminSSD_step;
-        __constant__ int cwidth;
-        __constant__ int cheight;
-
-        __device__ __forceinline__ int SQ(int a)
-        {
-            return a * a;
-        }
-
-        template<int RADIUS>
-        __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
-        {
-            unsigned int cache = 0;
-            unsigned int cache2 = 0;
-
-            for(int i = 1; i <= RADIUS; i++)
-                cache += col_ssd[i];
-
-            col_ssd_cache[0] = cache;
-
-            __syncthreads();
-
-            if (threadIdx.x < BLOCK_W - RADIUS)
-                cache2 = col_ssd_cache[RADIUS];
-            else
-                for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++)
-                    cache2 += col_ssd[i];
-
-            return col_ssd[0] + cache + cache2;
-        }
-
-        template<int RADIUS>
-        __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
-        {
-            unsigned int ssd[N_DISPARITIES];
-
-            //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
-            ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
-            __syncthreads();
-            ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
-            __syncthreads();
-            ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
-            __syncthreads();
-            ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
-            __syncthreads();
-            ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
-            __syncthreads();
-            ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
-            __syncthreads();
-            ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
-            __syncthreads();
-            ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));
-
-            int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));
-
-            int bestIdx = 0;
-            for (int i = 0; i < N_DISPARITIES; i++)
-            {
-                if (mssd == ssd[i])
-                    bestIdx = i;
-            }
-
-            return make_uint2(mssd, bestIdx);
-        }
-
-        template<int RADIUS>
-        __device__ void StepDown(int idx1, int idx2, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
-        {
-            unsigned char leftPixel1;
-            unsigned char leftPixel2;
-            unsigned char rightPixel1[8];
-            unsigned char rightPixel2[8];
-            unsigned int diff1, diff2;
-
-            leftPixel1 = imageL[idx1];
-            leftPixel2 = imageL[idx2];
-
-            idx1 = idx1 - d;
-            idx2 = idx2 - d;
-
-            rightPixel1[7] = imageR[idx1 - 7];
-            rightPixel1[0] = imageR[idx1 - 0];
-            rightPixel1[1] = imageR[idx1 - 1];
-            rightPixel1[2] = imageR[idx1 - 2];
-            rightPixel1[3] = imageR[idx1 - 3];
-            rightPixel1[4] = imageR[idx1 - 4];
-            rightPixel1[5] = imageR[idx1 - 5];
-            rightPixel1[6] = imageR[idx1 - 6];
-
-            rightPixel2[7] = imageR[idx2 - 7];
-            rightPixel2[0] = imageR[idx2 - 0];
-            rightPixel2[1] = imageR[idx2 - 1];
-            rightPixel2[2] = imageR[idx2 - 2];
-            rightPixel2[3] = imageR[idx2 - 3];
-            rightPixel2[4] = imageR[idx2 - 4];
-            rightPixel2[5] = imageR[idx2 - 5];
-            rightPixel2[6] = imageR[idx2 - 6];
-
-            //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
-            diff1 = leftPixel1 - rightPixel1[0];
-            diff2 = leftPixel2 - rightPixel2[0];
-            col_ssd[0 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-            diff1 = leftPixel1 - rightPixel1[1];
-            diff2 = leftPixel2 - rightPixel2[1];
-            col_ssd[1 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-            diff1 = leftPixel1 - rightPixel1[2];
-            diff2 = leftPixel2 - rightPixel2[2];
-            col_ssd[2 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-            diff1 = leftPixel1 - rightPixel1[3];
-            diff2 = leftPixel2 - rightPixel2[3];
-            col_ssd[3 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-            diff1 = leftPixel1 - rightPixel1[4];
-            diff2 = leftPixel2 - rightPixel2[4];
-            col_ssd[4 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-            diff1 = leftPixel1 - rightPixel1[5];
-            diff2 = leftPixel2 - rightPixel2[5];
-            col_ssd[5 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-            diff1 = leftPixel1 - rightPixel1[6];
-            diff2 = leftPixel2 - rightPixel2[6];
-            col_ssd[6 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-
-            diff1 = leftPixel1 - rightPixel1[7];
-            diff2 = leftPixel2 - rightPixel2[7];
-            col_ssd[7 * (BLOCK_W + 2 * RADIUS)] += SQ(diff2) - SQ(diff1);
-        }
-
-        // Computes the starting column SSD values for one image column: for each of eight
-        // consecutive disparities (d .. d + 7) it sums the squared left/right pixel
-        // differences over the 2 * RADIUS + 1 rows beginning at y_tex.
-        //   x_tex, y_tex - column and first row of the vertical strip
-        //   im_pitch     - row pitch used to address both images
-        //   col_ssd      - destination; disparity k is written to col_ssd[k * (BLOCK_W + 2 * RADIUS)]
-        template<int RADIUS>
-        __device__ void InitColSSD(int x_tex, int y_tex, int im_pitch, unsigned char* imageL, unsigned char* imageR, int d, volatile unsigned int *col_ssd)
-        {
-            unsigned int acc[8] = {0, 0, 0, 0, 0, 0, 0, 0};
-
-            const int last_row = y_tex + 2 * RADIUS;
-            for (int row = y_tex; row <= last_row; ++row)
-            {
-                const int left_idx = row * im_pitch + x_tex;
-                const unsigned char left_px = imageL[left_idx];
-                const int right_idx = left_idx - d;
-
-                #pragma unroll
-                for (int k = 0; k < 8; ++k)
-                    acc[k] += SQ(left_px - imageR[right_idx - k]);
-            }
-
-            //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
-            #pragma unroll
-            for (int k = 0; k < 8; ++k)
-                col_ssd[k * (BLOCK_W + 2 * RADIUS)] = acc[k];
-        }
-
-        // Block-matching kernel. For every disparity step it keeps per-column SSD values in
-        // dynamic shared memory and, per pixel, stores a new disparity in `disp` whenever the
-        // value returned by MinSSD is smaller than the one held in cminSSDImage.
-        //   left, right - image data, addressed as row * img_step + column
-        //   img_step    - row pitch passed on for both images
-        //   disp        - output disparity map
-        //   maxdisp     - exclusive upper bound of the disparity loop
-        // Also reads cwidth, cheight, cminSSDImage and cminSSD_step, which are declared
-        // outside this kernel.
-        template<int RADIUS>
-        __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t img_step, PtrStepb disp, int maxdisp)
-        {
-            extern __shared__ unsigned int col_ssd_cache[];
-            // This thread's slot: BLOCK_W + threadIdx.x elements into the shared array.
-            volatile unsigned int *col_ssd = col_ssd_cache + BLOCK_W + threadIdx.x;
-            // Only the first 2 * RADIUS threads own a second slot (BLOCK_W further on); all others get 0.
-            volatile unsigned int *col_ssd_extra = threadIdx.x < (2 * RADIUS) ? col_ssd + BLOCK_W : 0;  //#define N_DIRTY_PIXELS (2 * RADIUS)
-
-            //#define X (blockIdx.x * BLOCK_W + threadIdx.x + STEREO_MAXD)
-            int X = (blockIdx.x * BLOCK_W + threadIdx.x + maxdisp + RADIUS);
-            //#define Y (__mul24(blockIdx.y, ROWSperTHREAD) + RADIUS)
-            // NOTE(review): Y is a preprocessor macro defined inside the function body and is
-            // not #undef'd in the visible code, so it stays defined for whatever follows.
-            #define Y (blockIdx.y * ROWSperTHREAD + RADIUS)
-            //int Y = blockIdx.y * ROWSperTHREAD + RADIUS;
-
-            // Per-pixel output locations for the first row handled by this thread.
-            unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
-            unsigned char* disparImage = disp.data + X + Y * disp.step;
-         /*   if (X < cwidth)
-            {
-                unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
-                for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
-                    *ptr = 0xFFFFFFFF;
-            }*/
-            // Number of rows this thread iterates over, clipped against the image height.
-            int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);
-            int y_tex;
-            int x_tex = X - RADIUS;
-
-            // NOTE(review): threads that return here never reach the __syncthreads() calls
-            // below - confirm this is safe for blocks that straddle the right image edge.
-            if (x_tex >= cwidth)
-                return;
-
-            for(int d = STEREO_MIND; d < maxdisp; d += STEREO_DISP_STEP)
-            {
-                y_tex = Y - RADIUS;
-
-                // Column SSDs for the first window position (rows y_tex .. y_tex + 2 * RADIUS).
-                InitColSSD<RADIUS>(x_tex, y_tex, img_step, left, right, d, col_ssd);
-
-                // Pointer compared against 0: true only for threads that own an extra column.
-                if (col_ssd_extra > 0)
-                    if (x_tex + BLOCK_W < cwidth)
-                        InitColSSD<RADIUS>(x_tex + BLOCK_W, y_tex, img_step, left, right, d, col_ssd_extra);
-
-                __syncthreads(); //before MinSSD function
-
-                if (X < cwidth - RADIUS && Y < cheight - RADIUS)
-                {
-                    // presumably .x is the best SSD and .y the matching offset from d - MinSSD is not visible here
-                    uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
-                    if (minSSD.x < minSSDImage[0])
-                    {
-                        disparImage[0] = (unsigned char)(d + minSSD.y);
-                        minSSDImage[0] = minSSD.x;
-                    }
-                }
-
-                for(int row = 1; row < end_row; row++)
-                {
-                    // idx1 addresses row y_tex, idx2 row y_tex + 2 * RADIUS + 1; StepDown subtracts
-                    // the squared differences of the former and adds those of the latter.
-                    int idx1 = y_tex * img_step + x_tex;
-                    int idx2 = (y_tex + (2 * RADIUS + 1)) * img_step + x_tex;
-
-                    // Barrier before the shared column sums are modified again.
-                    __syncthreads();
-
-                    StepDown<RADIUS>(idx1, idx2, left, right, d, col_ssd);
-
-                    // The extra column lies BLOCK_W pixels to the right, hence the shifted base pointers.
-                    if (col_ssd_extra)
-                        if (x_tex + BLOCK_W < cwidth)
-                            StepDown<RADIUS>(idx1, idx2, left + BLOCK_W, right + BLOCK_W, d, col_ssd_extra);
-
-                    y_tex += 1;
-
-                    __syncthreads(); //before MinSSD function
-
-                    if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)
-                    {
-                        // Offset of this row inside the min-SSD buffer, relative to minSSDImage.
-                        int idx = row * cminSSD_step;
-                        uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
-                        if (minSSD.x < minSSDImage[idx])
-                        {
-                            disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
-                            minSSDImage[idx] = minSSD.x;
-                        }
-                    }
-                } // for row loop
-            } // for d loop
-        }
-
-
-        // Host-side launcher for stereoKernel<RADIUS>: sizes the grid from the image
-        // dimensions, reserves the dynamic shared memory the kernel indexes, launches on
-        // `stream`, and blocks until completion when the default stream (0) is used.
-        template<int RADIUS> void kernel_caller(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream)
-        {
-            dim3 block_dim(BLOCK_W, 1, 1);
-            dim3 grid_dim(1, 1, 1);
-            grid_dim.x = divUp(left.cols - maxdisp - 2 * RADIUS, BLOCK_W);
-            grid_dim.y = divUp(left.rows - 2 * RADIUS, ROWSperTHREAD);
-
-            // BLOCK_W leading entries plus N_DISPARITIES strips of COL_SSD_SIZE
-            // (BLOCK_W + 2 * RADIUS) entries each.
-            const size_t shared_bytes = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
-
-            stereoKernel<RADIUS><<<grid_dim, block_dim, shared_bytes, stream>>>(left.data, right.data, left.step, disp, maxdisp);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-            {
-                cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        }
-
-        // Signature shared by every kernel_caller<RADIUS> instantiation.
-        typedef void (*kernel_caller_t)(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, cudaStream_t & stream);
-
-        // Dispatch table indexed by window radius (stereoBM_GPU indexes it with winsz >> 1).
-        // Slot 0 is empty because a radius of 0 is rejected by the caller; slot r must hold
-        // kernel_caller<r>. Slot 14 previously repeated kernel_caller<15>, which made
-        // radius 14 run the radius-15 kernel.
-        const static kernel_caller_t callers[] =
-        {
-            0,
-            kernel_caller< 1>, kernel_caller< 2>, kernel_caller< 3>, kernel_caller< 4>, kernel_caller< 5>,
-            kernel_caller< 6>, kernel_caller< 7>, kernel_caller< 8>, kernel_caller< 9>, kernel_caller<10>,
-            kernel_caller<11>, kernel_caller<12>, kernel_caller<13>, kernel_caller<14>, kernel_caller<15>,
-            kernel_caller<16>, kernel_caller<17>, kernel_caller<18>, kernel_caller<19>, kernel_caller<20>,
-            kernel_caller<21>, kernel_caller<22>, kernel_caller<23>, kernel_caller<24>, kernel_caller<25>
-        };
-        // Number of table slots, including the empty slot 0.
-        const int calles_num = sizeof(callers)/sizeof(callers[0]);
-
-        // Host entry point for block matching. Validates the window size, clears the
-        // disparity map to 0 and fills the min-SSD buffer with 0xFF bytes, uploads the image
-        // size and the min-SSD buffer description to device symbols, then dispatches to the
-        // kernel_caller instantiation selected by the window radius (winsz >> 1).
-        // Raises cv::Error::StsBadArg when no caller exists for that radius.
-        void stereoBM_GPU(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& disp, int maxdisp, int winsz, const PtrStepSz<unsigned int>& minSSD_buf, cudaStream_t& stream)
-        {
-            const int radius = winsz >> 1;
-
-            const bool has_caller = radius != 0 && radius < calles_num;
-            if (!has_caller)
-                CV_Error(cv::Error::StsBadArg, "Unsupported window size");
-
-            // Reset both outputs before any kernel compares against them.
-            cudaSafeCall( cudaMemset2D(disp.data, disp.step, 0, disp.cols, disp.rows) );
-            cudaSafeCall( cudaMemset2D(minSSD_buf.data, minSSD_buf.step, 0xFF, minSSD_buf.cols * minSSD_buf.elemSize(), disp.rows) );
-
-            // Publish image size and min-SSD buffer pointer to the device symbols.
-            cudaSafeCall( cudaMemcpyToSymbol( cwidth, &left.cols, sizeof(left.cols) ) );
-            cudaSafeCall( cudaMemcpyToSymbol( cheight, &left.rows, sizeof(left.rows) ) );
-            cudaSafeCall( cudaMemcpyToSymbol( cminSSDImage, &minSSD_buf.data, sizeof(minSSD_buf.data) ) );
-
-            // The kernel indexes the buffer in elements, so convert the byte pitch.
-            size_t ssd_step_elems = minSSD_buf.step/minSSD_buf.elemSize();
-            cudaSafeCall( cudaMemcpyToSymbol( cminSSD_step,  &ssd_step_elems, sizeof(ssd_step_elems) ) );
-
-            const kernel_caller_t launch = callers[radius];
-            launch(left, right, disp, maxdisp, stream);
-        }
-
-        //////////////////////////////////////////////////////////////////////////////////////////////////
-        /////////////////////////////////////// Sobel Prefilter //////////////////////////////////////////
-        //////////////////////////////////////////////////////////////////////////////////////////////////
-
-        texture<unsigned char, 2, cudaReadModeElementType> texForSobel;
-
-        // Horizontal Sobel pre-filter. Each thread reads the 3x3 neighbourhood of (x, y)
-        // from texForSobel, clamps the response to [-prefilterCap, prefilterCap], shifts it
-        // by prefilterCap, caps it at 255 and writes the low byte to `output`.
-        __global__ void prefilter_kernel(PtrStepSzb output, int prefilterCap)
-        {
-            const int col = blockDim.x * blockIdx.x + threadIdx.x;
-            const int row = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (col >= output.cols || row >= output.rows)
-                return;
-
-            // Right-minus-left differences for the three rows of the stencil.
-            const int upper  = (int)tex2D(texForSobel, col + 1, row - 1) - (int)tex2D(texForSobel, col - 1, row - 1);
-            const int middle = (int)tex2D(texForSobel, col + 1, row    ) - (int)tex2D(texForSobel, col - 1, row    );
-            const int lower  = (int)tex2D(texForSobel, col + 1, row + 1) - (int)tex2D(texForSobel, col - 1, row + 1);
-
-            const int response = upper + 2 * middle + lower;
-
-            const int clamped = ::min(::max(-prefilterCap, response), prefilterCap);
-            const int shifted = ::min(clamped + prefilterCap, 255);
-
-            output.ptr(row)[col] = shifted & 0xFF;
-        }
-
-        // Runs prefilter_kernel over `input`: binds the image to texForSobel, launches one
-        // thread per pixel in 16x16 blocks on `stream`, synchronizes when the default
-        // stream (0) is used, and unbinds the texture afterwards.
-        void prefilter_xsobel(const PtrStepSzb& input, const PtrStepSzb& output, int prefilterCap, cudaStream_t & stream)
-        {
-            cudaChannelFormatDesc channel = cudaCreateChannelDesc<unsigned char>();
-            cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, channel, input.cols, input.rows, input.step ) );
-
-            dim3 block_dim(16, 16, 1);
-            dim3 grid_dim(1, 1, 1);
-            grid_dim.x = divUp(input.cols, block_dim.x);
-            grid_dim.y = divUp(input.rows, block_dim.y);
-
-            prefilter_kernel<<<grid_dim, block_dim, 0, stream>>>(output, prefilterCap);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-            {
-                cudaSafeCall( cudaDeviceSynchronize() );
-            }
-
-            cudaSafeCall( cudaUnbindTexture (texForSobel ) );
-        }
-
-
-        //////////////////////////////////////////////////////////////////////////////////////////////////
-        /////////////////////////////////// Textureness filtering ////////////////////////////////////////
-        //////////////////////////////////////////////////////////////////////////////////////////////////
-
-        texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
-
-        // Absolute value of the horizontal Sobel response at (x, y), sampled from texForTF.
-        // The terms are accumulated in the same order as a single left-to-right sum.
-        __device__ __forceinline__ float sobel(int x, int y)
-        {
-            float response = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1);
-            response = response + tex2D(texForTF, x - 1, y    ) * (-2);
-            response = response + tex2D(texForTF, x + 1, y    ) * (2);
-            response = response + tex2D(texForTF, x - 1, y + 1) * (-1);
-            response = response + tex2D(texForTF, x + 1, y + 1) * (1);
-
-            return fabs(response);
-        }
-
-        // Sums cols[0 .. winsz - 1] for the calling thread. The partial sum over
-        // cols[1 .. winsz/2] is published to cols_cache[0]; after the barrier, threads far
-        // enough from the end of the block read cols_cache[winsz/2] for the upper half
-        // instead of adding it up themselves.
-        // Contains a __syncthreads(), so it has to be reached by the whole block.
-        __device__ float CalcSums(float *cols, float *cols_cache, int winsz)
-        {
-            const int half = winsz/2;
-
-            float lower_sum = 0;
-            for (int k = 1; k <= half; ++k)
-                lower_sum += cols[k];
-
-            cols_cache[0] = lower_sum;
-
-            __syncthreads();
-
-            float upper_sum = 0;
-            if (threadIdx.x >= blockDim.x - half)
-            {
-                // No cached value is used near the end of the block: accumulate directly.
-                for (int k = half + 1; k < winsz; ++k)
-                    upper_sum += cols[k];
-            }
-            else
-            {
-                upper_sum = cols_cache[half];
-            }
-
-            return cols[0] + lower_sum + upper_sum;
-        }
-
-        // Number of image rows each block of textureness_kernel walks through.
-        #define RpT (2 * ROWSperTHREAD)  // got experimentally
-
-        // Textureness post-filter: sets disp(x, y) to 0 wherever the windowed sum of
-        // absolute Sobel responses (CalcSums result scaled by 255) is below `threshold`.
-        //   disp      - disparity map, modified in place
-        //   winsz     - window size; winsz/2 is used as the half-width
-        //   threshold - lower bound on the scaled window sum
-        // Each thread handles one column x and the rows [beg_row, end_row) of its block.
-        __global__ void textureness_kernel(PtrStepSzb disp, int winsz, float threshold)
-        {
-            int winsz2 = winsz/2;
-            int n_dirty_pixels = (winsz2) * 2;
-
-            extern __shared__ float cols_cache[];
-            // This thread's column-sum slot, blockDim.x + threadIdx.x elements into the shared array.
-            float *cols = cols_cache + blockDim.x + threadIdx.x;
-            // Only the first n_dirty_pixels threads own a second slot, blockDim.x further on.
-            float *cols_extra = threadIdx.x < n_dirty_pixels ? cols + blockDim.x : 0;
-
-            int x = blockIdx.x * blockDim.x + threadIdx.x;
-            int beg_row = blockIdx.y * RpT;
-            int end_row = ::min(beg_row + RpT, disp.rows);
-
-            // NOTE(review): every __syncthreads() below (including the one inside CalcSums)
-            // sits inside this branch, so threads with x >= disp.cols never reach them -
-            // confirm this is safe for blocks that straddle the right image edge.
-            if (x < disp.cols)
-            {
-                int y = beg_row;
-
-                float sum = 0;
-                float sum_extra = 0;
-
-                // Vertical sum of sobel() over rows y - winsz2 .. y + winsz2 at column x - winsz2
-                // (and at column x + blockDim.x - winsz2 for threads owning an extra slot).
-                for(int i = y - winsz2; i <= y + winsz2; ++i)
-                {
-                    sum += sobel(x - winsz2, i);
-                    if (cols_extra)
-                        sum_extra += sobel(x + blockDim.x - winsz2, i);
-                }
-                *cols = sum;
-                if (cols_extra)
-                    *cols_extra = sum_extra;
-
-                // Column sums must be visible to the other threads before CalcSums reads them.
-                __syncthreads();
-
-                float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
-                if (sum_win < threshold)
-                    disp.data[y * disp.step + x] = 0;
-
-                __syncthreads();
-
-                for(int y = beg_row + 1; y < end_row; ++y)
-                {
-                    // Slide the vertical window down one row: drop row y - winsz2 - 1, add row y + winsz2.
-                    sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2);
-                    *cols = sum;
-
-                    if (cols_extra)
-                    {
-                        sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);
-                        *cols_extra = sum_extra;
-                    }
-
-                    __syncthreads();
-                    float sum_win = CalcSums(cols, cols_cache + threadIdx.x, winsz) * 255;
-                    if (sum_win < threshold)
-                        disp.data[y * disp.step + x] = 0;
-
-                    // Barrier before the shared slots are overwritten in the next iteration.
-                    __syncthreads();
-                }
-            }
-        }
-
-        // Host wrapper for textureness_kernel. Scales the per-pixel threshold by the window
-        // area, binds `input` to texForTF (linear filtering, wrap addressing), launches
-        // 128-thread blocks that each cover RpT rows, synchronizes when the default stream
-        // (0) is used, and unbinds the texture afterwards.
-        void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream)
-        {
-            avgTexturenessThreshold = avgTexturenessThreshold * (winsz * winsz);
-
-            texForTF.filterMode     = cudaFilterModeLinear;
-            texForTF.addressMode[0] = cudaAddressModeWrap;
-            texForTF.addressMode[1] = cudaAddressModeWrap;
-
-            cudaChannelFormatDesc channel = cudaCreateChannelDesc<unsigned char>();
-            cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, channel, input.cols, input.rows, input.step ) );
-
-            dim3 block_dim(128, 1, 1);
-            dim3 grid_dim(1, 1, 1);
-            grid_dim.x = divUp(input.cols, block_dim.x);
-            grid_dim.y = divUp(input.rows, RpT);
-
-            // Two blockDim.x-wide strips plus (winsz/2) * 2 extra column slots.
-            const size_t shared_bytes = (block_dim.x + block_dim.x + (winsz/2) * 2 ) * sizeof(float);
-
-            textureness_kernel<<<grid_dim, block_dim, shared_bytes, stream>>>(disp, winsz, avgTexturenessThreshold);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-            {
-                cudaSafeCall( cudaDeviceSynchronize() );
-            }
-
-            cudaSafeCall( cudaUnbindTexture (texForTF) );
-        }
-    } // namespace stereobm
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/stereobp.cu
+++ b/modules/gpu/src/cuda/stereobp.cu
@@ -1,538 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace stereobp
-    {
-        ///////////////////////////////////////////////////////////////
-        /////////////////////// load constants ////////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        // Parameters read by the kernels in this namespace; filled in by load_constants().
-        __constant__ int   cndisp;
-        __constant__ float cmax_data_term;
-        __constant__ float cdata_weight;
-        __constant__ float cmax_disc_term;
-        __constant__ float cdisc_single_jump;
-
-        // Copies each host-side argument into the __constant__ symbol of the matching name.
-        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(ndisp)) );
-
-            cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(max_data_term)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(data_weight)) );
-
-            cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(max_disc_term)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(disc_single_jump)) );
-        }
-
-        ///////////////////////////////////////////////////////////////
-        ////////////////////////// comp data //////////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        // Pixel difference functor, specialized by channel count. The constructor captures
-        // the left pixel; operator() returns its absolute difference to a right pixel. The
-        // 3- and 4-channel versions weight channels 0/1/2 by 0.114 / 0.587 / 0.299; the
-        // 4-channel version ignores the fourth channel.
-        template <int cn> struct PixDiff;
-
-        template <> struct PixDiff<1>
-        {
-            uchar l;
-
-            __device__ __forceinline__ PixDiff(const uchar* ls) : l(*ls)
-            {
-            }
-
-            __device__ __forceinline__ float operator()(const uchar* rs) const
-            {
-                return ::abs((int)l - *rs);
-            }
-        };
-
-        template <> struct PixDiff<3>
-        {
-            uchar3 l;
-
-            __device__ __forceinline__ PixDiff(const uchar* ls) : l(*((uchar3*)ls))
-            {
-            }
-
-            __device__ __forceinline__ float operator()(const uchar* rs) const
-            {
-                const float tr = 0.299f;
-                const float tg = 0.587f;
-                const float tb = 0.114f;
-
-                float weighted = tb * ::abs((int)l.x - rs[0]);
-                weighted += tg * ::abs((int)l.y - rs[1]);
-                weighted += tr * ::abs((int)l.z - rs[2]);
-                return weighted;
-            }
-        };
-
-        template <> struct PixDiff<4>
-        {
-            uchar4 l;
-
-            __device__ __forceinline__ PixDiff(const uchar* ls) : l(*((uchar4*)ls))
-            {
-            }
-
-            __device__ __forceinline__ float operator()(const uchar* rs) const
-            {
-                const float tr = 0.299f;
-                const float tg = 0.587f;
-                const float tb = 0.114f;
-
-                // Load the right pixel as one 4-byte vector.
-                const uchar4 r = *((uchar4*)rs);
-
-                float weighted = tb * ::abs((int)l.x - r.x);
-                weighted += tg * ::abs((int)l.y - r.y);
-                weighted += tr * ::abs((int)l.z - r.z);
-                return weighted;
-            }
-        };
-
-        // Fills the data-cost volume for every interior pixel (a one-pixel border is left
-        // untouched). For disparity `disp` the cost is cdata_weight * pixel difference,
-        // capped at cdata_weight * cmax_data_term; when the shifted column x - disp would
-        // be less than 1 the capped value is stored directly. Disparity planes are
-        // data.step * left.rows / sizeof(D) elements apart.
-        template <int cn, typename D>
-        __global__ void comp_data(const PtrStepSzb left, const PtrStepb right, PtrStep<D> data)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            const bool interior = y > 0 && y < left.rows - 1 && x > 0 && x < left.cols - 1;
-            if (!interior)
-                return;
-
-            const uchar* left_px = left.ptr(y) + x * cn;
-            const PixDiff<cn> pixDiff(left_px);
-            const uchar* right_px = right.ptr(y) + x * cn;
-
-            D* out = data.ptr(y) + x;
-            const size_t plane_step = data.step * left.rows / sizeof(D);
-
-            for (int disp = 0; disp < cndisp; disp++)
-            {
-                if (x - disp < 1)
-                {
-                    // No valid right-image column for this disparity.
-                    out[disp * plane_step] = saturate_cast<D>(cdata_weight * cmax_data_term);
-                    continue;
-                }
-
-                float val = pixDiff(right_px - disp * cn);
-                out[disp * plane_step] = saturate_cast<D>(fmin(cdata_weight * val, cdata_weight * cmax_data_term));
-            }
-        }
-
-        // Launch logic shared by every comp_data_gpu specialization: one thread per pixel of
-        // `left` in 32x8 blocks, followed by a launch-error check and, on the default
-        // stream (0), a device synchronization.
-        template <int cn, typename D>
-        void launch_comp_data(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(left.cols, threads.x);
-            grid.y = divUp(left.rows, threads.y);
-
-            comp_data<cn, D><<<grid, threads, 0, stream>>>(left, right, (PtrStepSz<D>)data);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        // Host entry points: T selects the pixel type (1, 3 or 4 channels), D the cost type.
-        template<typename T, typename D>
-        void comp_data_gpu(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
-
-        template <> void comp_data_gpu<uchar, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
-        {
-            launch_comp_data<1, short>(left, right, data, stream);
-        }
-        template <> void comp_data_gpu<uchar, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
-        {
-            launch_comp_data<1, float>(left, right, data, stream);
-        }
-
-        template <> void comp_data_gpu<uchar3, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
-        {
-            launch_comp_data<3, short>(left, right, data, stream);
-        }
-        template <> void comp_data_gpu<uchar3, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
-        {
-            launch_comp_data<3, float>(left, right, data, stream);
-        }
-
-        template <> void comp_data_gpu<uchar4, short>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
-        {
-            launch_comp_data<4, short>(left, right, data, stream);
-        }
-        template <> void comp_data_gpu<uchar4, float>(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream)
-        {
-            launch_comp_data<4, float>(left, right, data, stream);
-        }
-
-        ///////////////////////////////////////////////////////////////
-        //////////////////////// data step down ///////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        // Builds one coarser data-cost level: for every disparity plane, destination
-        // element (x, y) receives the sum of the 2x2 source elements at (2x..2x+1, 2y..2y+1),
-        // saturated to T. Plane d starts at row d * src_rows in `src` and d * dst_rows in `dst`.
-        template <typename T>
-        __global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= dst_cols || y >= dst_rows)
-                return;
-
-            for (int d = 0; d < cndisp; ++d)
-            {
-                const T* upper = src.ptr(d * src_rows + (2*y+0));
-                const T* lower = src.ptr(d * src_rows + (2*y+1));
-
-                // Same accumulation order as a straight column-by-column sum.
-                float total = upper[(2*x+0)];
-                total += lower[(2*x+0)];
-                total += upper[(2*x+1)];
-                total += lower[(2*x+1)];
-
-                dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(total);
-            }
-        }
-
-        // Host wrapper for data_step_down<T>: one thread per destination element in 32x8
-        // blocks, with a launch-error check and a device synchronization on the default
-        // stream (0). Explicitly instantiated for short and float below.
-        template<typename T>
-        void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream)
-        {
-            dim3 block_dim(32, 8, 1);
-            dim3 grid_dim(1, 1, 1);
-            grid_dim.x = divUp(dst_cols, block_dim.x);
-            grid_dim.y = divUp(dst_rows, block_dim.y);
-
-            data_step_down<T><<<grid_dim, block_dim, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)src, (PtrStepSz<T>)dst);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-            {
-                cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        }
-
-        template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
-        template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
-
-        ///////////////////////////////////////////////////////////////
-        /////////////////// level up messages  ////////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        // Copies messages from a coarser level to a finer one: destination element (x, y)
-        // takes the value of source element (x/2, y/2) for every disparity plane. Planes are
-        // step * rows / sizeof(T) elements apart in each buffer.
-        template <typename T>
-        __global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= dst_cols || y >= dst_rows)
-                return;
-
-            const size_t dst_plane = dst.step * dst_rows / sizeof(T);
-            const size_t src_plane = src.step * src_rows / sizeof(T);
-
-            T*       out = dst.ptr(y  ) + x;
-            const T* in  = src.ptr(y/2) + x/2;
-
-            for (int d = 0; d < cndisp; ++d)
-            {
-                out[d * dst_plane] = in[d * src_plane];
-            }
-        }
-
-        // Host wrapper that runs level_up_message<T> on the four message sets (mus, mds,
-        // mls, mrs, in that order). Each set is a two-entry array: entry dst_idx is written,
-        // the other entry ((dst_idx + 1) & 1) is read. Checks for a launch error after each
-        // kernel and synchronizes when the default stream (0) is used.
-        template <typename T>
-        void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(dst_cols, threads.x);
-            grid.y = divUp(dst_rows, threads.y);
-
-            const int src_idx = (dst_idx + 1) & 1;
-
-            PtrStepSzb* const message_sets[4] = { mus, mds, mls, mrs };
-            for (int k = 0; k < 4; ++k)
-            {
-                PtrStepSzb* const set = message_sets[k];
-
-                level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)set[src_idx], (PtrStepSz<T>)set[dst_idx]);
-                cudaSafeCall( cudaGetLastError() );
-            }
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
-        template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
-
-        ///////////////////////////////////////////////////////////////
-        ////////////////////  calc all iterations /////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        // Two-pass in-place relaxation over the cndisp values dst[0], dst[step], ... :
-        // an ascending pass followed by a descending pass, each replacing an entry with
-        // (running value + cdisc_single_jump) whenever that is smaller. The running value
-        // is kept as an unrounded float even when T is an integer type.
-        template <typename T>
-        __device__ void calc_min_linear_penalty(T* dst, size_t step)
-        {
-            // Ascending pass.
-            float running = dst[0];
-            for (int disp = 1; disp < cndisp; ++disp)
-            {
-                running += cdisc_single_jump;
-
-                const float here = dst[step * disp];
-                if (running < here)
-                    dst[step * disp] = saturate_cast<T>(running);
-                else
-                    running = here;
-            }
-
-            // Descending pass, restarted from the last entry.
-            running = dst[(cndisp - 1) * step];
-            for (int disp = cndisp - 2; disp >= 0; disp--)
-            {
-                running += cdisc_single_jump;
-
-                const float here = dst[step * disp];
-                if (running < here)
-                    dst[step * disp] = saturate_cast<T>(running);
-                else
-                    running = here;
-            }
-        }
-
-        template <typename T>
-        __device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
-        {
-            float minimum = cudev::numeric_limits<float>::max();
-
-            for(int i = 0; i < cndisp; ++i)
-            {
-                float dst_reg  = msg1[msg_disp_step * i];
-                      dst_reg += msg2[msg_disp_step * i];
-                      dst_reg += msg3[msg_disp_step * i];
-                      dst_reg += data[data_disp_step * i];
-
-                if (dst_reg < minimum)
-                    minimum = dst_reg;
-
-                dst[msg_disp_step * i] = saturate_cast<T>(dst_reg);
-            }
-
-            calc_min_linear_penalty(dst, msg_disp_step);
-
-            minimum += cmax_disc_term;
-
-            float sum = 0;
-            for(int i = 0; i < cndisp; ++i)
-            {
-                float dst_reg = dst[msg_disp_step * i];
-                if (dst_reg > minimum)
-                {
-                    dst_reg = minimum;
-                    dst[msg_disp_step * i] = saturate_cast<T>(minimum);
-                }
-                sum += dst_reg;
-            }
-            sum /= cndisp;
-
-            for(int i = 0; i < cndisp; ++i)
-                dst[msg_disp_step * i] -= sum;
-        }
-
-        template <typename T>
-        __global__ void one_iteration(int t, int elem_step, T* u, T* d, T* l, T* r, const PtrStep<T> data, int cols, int rows)
-        {
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-            const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
-
-            if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
-            {
-                T* us = u + y * elem_step + x;
-                T* ds = d + y * elem_step + x;
-                T* ls = l + y * elem_step + x;
-                T* rs = r + y * elem_step + x;
-                const T* dt = data.ptr(y) + x;
-
-                size_t msg_disp_step = elem_step * rows;
-                size_t data_disp_step = data.step * rows / sizeof(T);
-
-                message(us + elem_step, ls         + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);
-                message(ds - elem_step, ls         + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);
-                message(us + elem_step, ds - elem_step, rs - 1, dt, rs, msg_disp_step, data_disp_step);
-                message(us + elem_step, ds - elem_step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
-            }
-        }
-
-        template <typename T>
-        void calc_all_iterations_gpu(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d,
-            const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(cols, threads.x << 1);
-            grid.y = divUp(rows, threads.y);
-
-            int elem_step = (int)(u.step / sizeof(T));
-
-            for(int t = 0; t < iters; ++t)
-            {
-                one_iteration<T><<<grid, threads, 0, stream>>>(t, elem_step, (T*)u.data, (T*)d.data, (T*)l.data, (T*)r.data, (PtrStepSz<T>)data, cols, rows);
-                cudaSafeCall( cudaGetLastError() );
-
-                if (stream == 0)
-                    cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        }
-
-        template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
-        template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, cudaStream_t stream);
-
-        ///////////////////////////////////////////////////////////////
-        /////////////////////////// output ////////////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        template <typename T>
-        __global__ void output(const int elem_step, const T* u, const T* d, const T* l, const T* r, const T* data,
-            PtrStepSz<short> disp)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
-            {
-                const T* us = u + (y + 1) * elem_step + x;
-                const T* ds = d + (y - 1) * elem_step + x;
-                const T* ls = l + y * elem_step + (x + 1);
-                const T* rs = r + y * elem_step+ (x - 1);
-                const T* dt = data + y * elem_step + x;
-
-                size_t disp_step = disp.rows * elem_step;
-
-                int best = 0;
-                float best_val = numeric_limits<float>::max();
-                for (int d = 0; d < cndisp; ++d)
-                {
-                    float val  = us[d * disp_step];
-                          val += ds[d * disp_step];
-                          val += ls[d * disp_step];
-                          val += rs[d * disp_step];
-                          val += dt[d * disp_step];
-
-                    if (val < best_val)
-                    {
-                        best_val = val;
-                        best = d;
-                    }
-                }
-
-                disp.ptr(y)[x] = saturate_cast<short>(best);
-            }
-        }
-
-        template <typename T>
-        void output_gpu(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data,
-            const PtrStepSz<short>& disp, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(disp.cols, threads.x);
-            grid.y = divUp(disp.rows, threads.y);
-
-            int elem_step = static_cast<int>(u.step/sizeof(T));
-
-            output<T><<<grid, threads, 0, stream>>>(elem_step, (const T*)u.data, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void output_gpu<short>(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
-        template void output_gpu<float>(const PtrStepSzb& u, const PtrStepSzb& d, const PtrStepSzb& l, const PtrStepSzb& r, const PtrStepSzb& data, const PtrStepSz<short>& disp, cudaStream_t stream);
-    } // namespace stereobp
-}}} // namespace cv { namespace gpu { namespace cudev
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/stereocsbp.cu
+++ b/modules/gpu/src/cuda/stereocsbp.cu
@@ -1,864 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace stereocsbp
-    {
-        ///////////////////////////////////////////////////////////////
-        /////////////////////// load constants ////////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        __constant__ int cndisp;
-
-        __constant__ float cmax_data_term;
-        __constant__ float cdata_weight;
-        __constant__ float cmax_disc_term;
-        __constant__ float cdisc_single_jump;
-
-        __constant__ int cth;
-
-        __constant__ size_t cimg_step;
-        __constant__ size_t cmsg_step;
-        __constant__ size_t cdisp_step1;
-        __constant__ size_t cdisp_step2;
-
-        __constant__ uchar* cleft;
-        __constant__ uchar* cright;
-        __constant__ uchar* ctemp;
-
-
-        void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
-                            const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& temp)
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
-
-            cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term,    &max_data_term,    sizeof(float)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cdata_weight,      &data_weight,      sizeof(float)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term,    &max_disc_term,    sizeof(float)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
-
-            cudaSafeCall( cudaMemcpyToSymbol(cth, &min_disp_th, sizeof(int)) );
-
-            cudaSafeCall( cudaMemcpyToSymbol(cimg_step, &left.step, sizeof(size_t)) );
-
-            cudaSafeCall( cudaMemcpyToSymbol(cleft,  &left.data,  sizeof(left.data)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cright, &right.data, sizeof(right.data)) );
-            cudaSafeCall( cudaMemcpyToSymbol(ctemp, &temp.data, sizeof(temp.data)) );
-        }
-
-        ///////////////////////////////////////////////////////////////
-        /////////////////////// init data cost ////////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        template <int channels> struct DataCostPerPixel;
-        template <> struct DataCostPerPixel<1>
-        {
-            static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
-            {
-                return fmin(cdata_weight * ::abs((int)*left - *right), cdata_weight * cmax_data_term);
-            }
-        };
-        template <> struct DataCostPerPixel<3>
-        {
-            static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
-            {
-                float tb = 0.114f * ::abs((int)left[0] - right[0]);
-                float tg = 0.587f * ::abs((int)left[1] - right[1]);
-                float tr = 0.299f * ::abs((int)left[2] - right[2]);
-
-                return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
-            }
-        };
-        template <> struct DataCostPerPixel<4>
-        {
-            static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
-            {
-                uchar4 l = *((const uchar4*)left);
-                uchar4 r = *((const uchar4*)right);
-
-                float tb = 0.114f * ::abs((int)l.x - r.x);
-                float tg = 0.587f * ::abs((int)l.y - r.y);
-                float tr = 0.299f * ::abs((int)l.z - r.z);
-
-                return fmin(cdata_weight * (tr + tg + tb), cdata_weight * cmax_data_term);
-            }
-        };
-
-        template <typename T>
-        __global__ void get_first_k_initial_global(T* data_cost_selected_, T *selected_disp_pyr, int h, int w, int nr_plane)
-        {
-            int x = blockIdx.x * blockDim.x + threadIdx.x;
-            int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y < h && x < w)
-            {
-                T* selected_disparity = selected_disp_pyr + y * cmsg_step + x;
-                T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
-                T* data_cost = (T*)ctemp + y * cmsg_step + x;
-
-                for(int i = 0; i < nr_plane; i++)
-                {
-                    T minimum = cudev::numeric_limits<T>::max();
-                    int id = 0;
-                    for(int d = 0; d < cndisp; d++)
-                    {
-                        T cur = data_cost[d * cdisp_step1];
-                        if(cur < minimum)
-                        {
-                            minimum = cur;
-                            id = d;
-                        }
-                    }
-
-                    data_cost_selected[i  * cdisp_step1] = minimum;
-                    selected_disparity[i  * cdisp_step1] = id;
-                    data_cost         [id * cdisp_step1] = numeric_limits<T>::max();
-                }
-            }
-        }
-
-
-        template <typename T>
-        __global__ void get_first_k_initial_local(T* data_cost_selected_, T* selected_disp_pyr, int h, int w, int nr_plane)
-        {
-            int x = blockIdx.x * blockDim.x + threadIdx.x;
-            int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y < h && x < w)
-            {
-                T* selected_disparity = selected_disp_pyr + y * cmsg_step + x;
-                T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
-                T* data_cost = (T*)ctemp + y * cmsg_step + x;
-
-                int nr_local_minimum = 0;
-
-                T prev = data_cost[0 * cdisp_step1];
-                T cur  = data_cost[1 * cdisp_step1];
-                T next = data_cost[2 * cdisp_step1];
-
-                for (int d = 1; d < cndisp - 1 && nr_local_minimum < nr_plane; d++)
-                {
-                    if (cur < prev && cur < next)
-                    {
-                        data_cost_selected[nr_local_minimum * cdisp_step1] = cur;
-                        selected_disparity[nr_local_minimum * cdisp_step1] = d;
-
-                        data_cost[d * cdisp_step1] = numeric_limits<T>::max();
-
-                        nr_local_minimum++;
-                    }
-                    prev = cur;
-                    cur = next;
-                    next = data_cost[(d + 1) * cdisp_step1];
-                }
-
-                for (int i = nr_local_minimum; i < nr_plane; i++)
-                {
-                    T minimum = numeric_limits<T>::max();
-                    int id = 0;
-
-                    for (int d = 0; d < cndisp; d++)
-                    {
-                        cur = data_cost[d * cdisp_step1];
-                        if (cur < minimum)
-                        {
-                            minimum = cur;
-                            id = d;
-                        }
-                    }
-                    data_cost_selected[i * cdisp_step1] = minimum;
-                    selected_disparity[i * cdisp_step1] = id;
-
-                    data_cost[id * cdisp_step1] = numeric_limits<T>::max();
-                }
-            }
-        }
-
-        template <typename T, int channels>
-        __global__ void init_data_cost(int h, int w, int level)
-        {
-            int x = blockIdx.x * blockDim.x + threadIdx.x;
-            int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y < h && x < w)
-            {
-                int y0 = y << level;
-                int yt = (y + 1) << level;
-
-                int x0 = x << level;
-                int xt = (x + 1) << level;
-
-                T* data_cost = (T*)ctemp + y * cmsg_step + x;
-
-                for(int d = 0; d < cndisp; ++d)
-                {
-                    float val = 0.0f;
-                    for(int yi = y0; yi < yt; yi++)
-                    {
-                        for(int xi = x0; xi < xt; xi++)
-                        {
-                            int xr = xi - d;
-                            if(d < cth || xr < 0)
-                                val += cdata_weight * cmax_data_term;
-                            else
-                            {
-                                const uchar* lle = cleft + yi * cimg_step + xi * channels;
-                                const uchar* lri = cright + yi * cimg_step + xr * channels;
-
-                                val += DataCostPerPixel<channels>::compute(lle, lri);
-                            }
-                        }
-                    }
-                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
-                }
-            }
-        }
-
-        template <typename T, int winsz, int channels>
-        __global__ void init_data_cost_reduce(int level, int rows, int cols, int h)
-        {
-            int x_out = blockIdx.x;
-            int y_out = blockIdx.y % h;
-            int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;
-
-            int tid = threadIdx.x;
-
-            if (d < cndisp)
-            {
-                int x0 = x_out << level;
-                int y0 = y_out << level;
-
-                int len = ::min(y0 + winsz, rows) - y0;
-
-                float val = 0.0f;
-                if (x0 + tid < cols)
-                {
-                    if (x0 + tid - d < 0 || d < cth)
-                        val = cdata_weight * cmax_data_term * len;
-                    else
-                    {
-                        const uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );
-                        const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - d);
-
-                        for(int y = 0; y < len; ++y)
-                        {
-                            val += DataCostPerPixel<channels>::compute(lle, lri);
-
-                            lle += cimg_step;
-                            lri += cimg_step;
-                        }
-                    }
-                }
-
-                extern __shared__ float smem[];
-
-                reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());
-
-                T* data_cost = (T*)ctemp + y_out * cmsg_step + x_out;
-
-                if (tid == 0)
-                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
-            }
-        }
-
-
-        template <typename T>
-        void init_data_cost_caller_(int /*rows*/, int /*cols*/, int h, int w, int level, int /*ndisp*/, int channels, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(w, threads.x);
-            grid.y = divUp(h, threads.y);
-
-            switch (channels)
-            {
-            case 1: init_data_cost<T, 1><<<grid, threads, 0, stream>>>(h, w, level); break;
-            case 3: init_data_cost<T, 3><<<grid, threads, 0, stream>>>(h, w, level); break;
-            case 4: init_data_cost<T, 4><<<grid, threads, 0, stream>>>(h, w, level); break;
-            default: CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
-            }
-        }
-
-        template <typename T, int winsz>
-        void init_data_cost_reduce_caller_(int rows, int cols, int h, int w, int level, int ndisp, int channels, cudaStream_t stream)
-        {
-            const int threadsNum = 256;
-            const size_t smem_size = threadsNum * sizeof(float);
-
-            dim3 threads(winsz, 1, threadsNum / winsz);
-            dim3 grid(w, h, 1);
-            grid.y *= divUp(ndisp, threads.z);
-
-            switch (channels)
-            {
-            case 1: init_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
-            case 3: init_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
-            case 4: init_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(level, rows, cols, h); break;
-            default: CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
-            }
-        }
-
-        template<class T>
-        void init_data_cost(int rows, int cols, T* disp_selected_pyr, T* data_cost_selected, size_t msg_step,
-                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream)
-        {
-
-            typedef void (*InitDataCostCaller)(int cols, int rows, int w, int h, int level, int ndisp, int channels, cudaStream_t stream);
-
-            static const InitDataCostCaller init_data_cost_callers[] =
-            {
-                init_data_cost_caller_<T>, init_data_cost_caller_<T>, init_data_cost_reduce_caller_<T, 4>,
-                init_data_cost_reduce_caller_<T, 8>, init_data_cost_reduce_caller_<T, 16>, init_data_cost_reduce_caller_<T, 32>,
-                init_data_cost_reduce_caller_<T, 64>, init_data_cost_reduce_caller_<T, 128>, init_data_cost_reduce_caller_<T, 256>
-            };
-
-            size_t disp_step = msg_step * h;
-            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,  &msg_step,  sizeof(size_t)) );
-
-            init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(w, threads.x);
-            grid.y = divUp(h, threads.y);
-
-            if (use_local_init_data_cost == true)
-                get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
-            else
-                get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);
-
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void init_data_cost(int rows, int cols, short* disp_selected_pyr, short* data_cost_selected, size_t msg_step,
-                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
-
-        template void init_data_cost(int rows, int cols, float* disp_selected_pyr, float* data_cost_selected, size_t msg_step,
-                    int h, int w, int level, int nr_plane, int ndisp, int channels, bool use_local_init_data_cost, cudaStream_t stream);
-
-        ///////////////////////////////////////////////////////////////
-        ////////////////////// compute data cost //////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        template <typename T, int channels>
-        __global__ void compute_data_cost(const T* selected_disp_pyr, T* data_cost_, int h, int w, int level, int nr_plane)
-        {
-            int x = blockIdx.x * blockDim.x + threadIdx.x;
-            int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y < h && x < w)
-            {
-                int y0 = y << level;
-                int yt = (y + 1) << level;
-
-                int x0 = x << level;
-                int xt = (x + 1) << level;
-
-                const T* selected_disparity = selected_disp_pyr + y/2 * cmsg_step + x/2;
-                T* data_cost = data_cost_ + y * cmsg_step + x;
-
-                for(int d = 0; d < nr_plane; d++)
-                {
-                    float val = 0.0f;
-                    for(int yi = y0; yi < yt; yi++)
-                    {
-                        for(int xi = x0; xi < xt; xi++)
-                        {
-                            int sel_disp = selected_disparity[d * cdisp_step2];
-                            int xr = xi - sel_disp;
-
-                            if (xr < 0 || sel_disp < cth)
-                                val += cdata_weight * cmax_data_term;
-                            else
-                            {
-                                const uchar* left_x = cleft + yi * cimg_step + xi * channels;
-                                const uchar* right_x = cright + yi * cimg_step + xr * channels;
-
-                                val += DataCostPerPixel<channels>::compute(left_x, right_x);
-                            }
-                        }
-                    }
-                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
-                }
-            }
-        }
-
-        template <typename T, int winsz, int channels>
-        __global__ void compute_data_cost_reduce(const T* selected_disp_pyr, T* data_cost_, int level, int rows, int cols, int h, int nr_plane)
-        {
-            int x_out = blockIdx.x;
-            int y_out = blockIdx.y % h;
-            int d = (blockIdx.y / h) * blockDim.z + threadIdx.z;
-
-            int tid = threadIdx.x;
-
-            const T* selected_disparity = selected_disp_pyr + y_out/2 * cmsg_step + x_out/2;
-            T* data_cost = data_cost_ + y_out * cmsg_step + x_out;
-
-            if (d < nr_plane)
-            {
-                int sel_disp = selected_disparity[d * cdisp_step2];
-
-                int x0 = x_out << level;
-                int y0 = y_out << level;
-
-                int len = ::min(y0 + winsz, rows) - y0;
-
-                float val = 0.0f;
-                if (x0 + tid < cols)
-                {
-                    if (x0 + tid - sel_disp < 0 || sel_disp < cth)
-                        val = cdata_weight * cmax_data_term * len;
-                    else
-                    {
-                        const uchar* lle =  cleft + y0 * cimg_step + channels * (x0 + tid    );
-                        const uchar* lri = cright + y0 * cimg_step + channels * (x0 + tid - sel_disp);
-
-                        for(int y = 0; y < len; ++y)
-                        {
-                            val += DataCostPerPixel<channels>::compute(lle, lri);
-
-                            lle += cimg_step;
-                            lri += cimg_step;
-                        }
-                    }
-                }
-
-                extern __shared__ float smem[];
-
-                reduce<winsz>(smem + winsz * threadIdx.z, val, tid, plus<float>());
-
-                if (tid == 0)
-                    data_cost[cdisp_step1 * d] = saturate_cast<T>(val);
-            }
-        }
-
-        template <typename T>
-        void compute_data_cost_caller_(const T* disp_selected_pyr, T* data_cost, int /*rows*/, int /*cols*/,
-                                      int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
-        {
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(w, threads.x);
-            grid.y = divUp(h, threads.y);
-
-            switch(channels)
-            {
-            case 1: compute_data_cost<T, 1><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
-            case 3: compute_data_cost<T, 3><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
-            case 4: compute_data_cost<T, 4><<<grid, threads, 0, stream>>>(disp_selected_pyr, data_cost, h, w, level, nr_plane); break;
-            default: CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
-            }
-        }
-
-        template <typename T, int winsz>
-        void compute_data_cost_reduce_caller_(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
-                                      int h, int w, int level, int nr_plane, int channels, cudaStream_t stream)
-        {
-            const int threadsNum = 256;
-            const size_t smem_size = threadsNum * sizeof(float);
-
-            dim3 threads(winsz, 1, threadsNum / winsz);
-            dim3 grid(w, h, 1);
-            grid.y *= divUp(nr_plane, threads.z);
-
-            switch (channels)
-            {
-            case 1: compute_data_cost_reduce<T, winsz, 1><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
-            case 3: compute_data_cost_reduce<T, winsz, 3><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
-            case 4: compute_data_cost_reduce<T, winsz, 4><<<grid, threads, smem_size, stream>>>(disp_selected_pyr, data_cost, level, rows, cols, h, nr_plane); break;
-            default: CV_Error(cv::Error::BadNumChannels, "Unsupported channels count");
-            }
-        }
-
-        template<class T>
-        void compute_data_cost(const T* disp_selected_pyr, T* data_cost, size_t msg_step,
-                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream)
-        {
-            typedef void (*ComputeDataCostCaller)(const T* disp_selected_pyr, T* data_cost, int rows, int cols,
-                int h, int w, int level, int nr_plane, int channels, cudaStream_t stream);
-
-            static const ComputeDataCostCaller callers[] =
-            {
-                compute_data_cost_caller_<T>, compute_data_cost_caller_<T>, compute_data_cost_reduce_caller_<T, 4>,
-                compute_data_cost_reduce_caller_<T, 8>, compute_data_cost_reduce_caller_<T, 16>, compute_data_cost_reduce_caller_<T, 32>,
-                compute_data_cost_reduce_caller_<T, 64>, compute_data_cost_reduce_caller_<T, 128>, compute_data_cost_reduce_caller_<T, 256>
-            };
-
-            size_t disp_step1 = msg_step * h;
-            size_t disp_step2 = msg_step * h2;
-            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,  &msg_step,  sizeof(size_t)) );
-
-            callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void compute_data_cost(const short* disp_selected_pyr, short* data_cost, size_t msg_step,
-                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
-
-        template void compute_data_cost(const float* disp_selected_pyr, float* data_cost, size_t msg_step,
-                               int rows, int cols, int h, int w, int h2, int level, int nr_plane, int channels, cudaStream_t stream);
-
-
-        ///////////////////////////////////////////////////////////////
-        //////////////////////// init message /////////////////////////
-        ///////////////////////////////////////////////////////////////
-
-
-         template <typename T>
-        __device__ void get_first_k_element_increase(T* u_new, T* d_new, T* l_new, T* r_new,
-                                                     const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
-                                                     T* data_cost_selected, T* disparity_selected_new, T* data_cost_new,
-                                                     const T* data_cost_cur, const T* disparity_selected_cur,
-                                                     int nr_plane, int nr_plane2)
-        {
-            for(int i = 0; i < nr_plane; i++)
-            {
-                T minimum = numeric_limits<T>::max();
-                int id = 0;
-                for(int j = 0; j < nr_plane2; j++)
-                {
-                    T cur = data_cost_new[j * cdisp_step1];
-                    if(cur < minimum)
-                    {
-                        minimum = cur;
-                        id = j;
-                    }
-                }
-
-                data_cost_selected[i * cdisp_step1] = data_cost_cur[id * cdisp_step1];
-                disparity_selected_new[i * cdisp_step1] = disparity_selected_cur[id * cdisp_step2];
-
-                u_new[i * cdisp_step1] = u_cur[id * cdisp_step2];
-                d_new[i * cdisp_step1] = d_cur[id * cdisp_step2];
-                l_new[i * cdisp_step1] = l_cur[id * cdisp_step2];
-                r_new[i * cdisp_step1] = r_cur[id * cdisp_step2];
-
-                data_cost_new[id * cdisp_step1] = numeric_limits<T>::max();
-            }
-        }
-
-        template <typename T>
-        __global__ void init_message(T* u_new_, T* d_new_, T* l_new_, T* r_new_,
-                                     const T* u_cur_, const T* d_cur_, const T* l_cur_, const T* r_cur_,
-                                     T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
-                                     T* data_cost_selected_, const T* data_cost_,
-                                     int h, int w, int nr_plane, int h2, int w2, int nr_plane2)
-        {
-            int x = blockIdx.x * blockDim.x + threadIdx.x;
-            int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y < h && x < w)
-            {
-                const T* u_cur = u_cur_ + ::min(h2-1, y/2 + 1) * cmsg_step + x/2;
-                const T* d_cur = d_cur_ + ::max(0, y/2 - 1)    * cmsg_step + x/2;
-                const T* l_cur = l_cur_ + (y/2)                * cmsg_step + ::min(w2-1, x/2 + 1);
-                const T* r_cur = r_cur_ + (y/2)                * cmsg_step + ::max(0, x/2 - 1);
-
-                T* data_cost_new = (T*)ctemp + y * cmsg_step + x;
-
-                const T* disparity_selected_cur = selected_disp_pyr_cur + y/2 * cmsg_step + x/2;
-                const T* data_cost = data_cost_ + y * cmsg_step + x;
-
-                for(int d = 0; d < nr_plane2; d++)
-                {
-                    int idx2 = d * cdisp_step2;
-
-                    T val  = data_cost[d * cdisp_step1] + u_cur[idx2] + d_cur[idx2] + l_cur[idx2] + r_cur[idx2];
-                    data_cost_new[d * cdisp_step1] = val;
-                }
-
-                T* data_cost_selected = data_cost_selected_ + y * cmsg_step + x;
-                T* disparity_selected_new = selected_disp_pyr_new + y * cmsg_step + x;
-
-                T* u_new = u_new_ + y * cmsg_step + x;
-                T* d_new = d_new_ + y * cmsg_step + x;
-                T* l_new = l_new_ + y * cmsg_step + x;
-                T* r_new = r_new_ + y * cmsg_step + x;
-
-                u_cur = u_cur_ + y/2 * cmsg_step + x/2;
-                d_cur = d_cur_ + y/2 * cmsg_step + x/2;
-                l_cur = l_cur_ + y/2 * cmsg_step + x/2;
-                r_cur = r_cur_ + y/2 * cmsg_step + x/2;
-
-                get_first_k_element_increase(u_new, d_new, l_new, r_new, u_cur, d_cur, l_cur, r_cur,
-                                             data_cost_selected, disparity_selected_new, data_cost_new,
-                                             data_cost, disparity_selected_cur, nr_plane, nr_plane2);
-            }
-        }
-
-
-        template<class T>
-        void init_message(T* u_new, T* d_new, T* l_new, T* r_new,
-                          const T* u_cur, const T* d_cur, const T* l_cur, const T* r_cur,
-                          T* selected_disp_pyr_new, const T* selected_disp_pyr_cur,
-                          T* data_cost_selected, const T* data_cost, size_t msg_step,
-                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream)
-        {
-
-            size_t disp_step1 = msg_step * h;
-            size_t disp_step2 = msg_step * h2;
-            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step1, sizeof(size_t)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step2, &disp_step2, sizeof(size_t)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,   &msg_step, sizeof(size_t)) );
-
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(w, threads.x);
-            grid.y = divUp(h, threads.y);
-
-            init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,
-                                                       u_cur, d_cur, l_cur, r_cur,
-                                                       selected_disp_pyr_new, selected_disp_pyr_cur,
-                                                       data_cost_selected, data_cost,
-                                                       h, w, nr_plane, h2, w2, nr_plane2);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-
-        template void init_message(short* u_new, short* d_new, short* l_new, short* r_new,
-                          const short* u_cur, const short* d_cur, const short* l_cur, const short* r_cur,
-                          short* selected_disp_pyr_new, const short* selected_disp_pyr_cur,
-                          short* data_cost_selected, const short* data_cost, size_t msg_step,
-                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
-
-        template void init_message(float* u_new, float* d_new, float* l_new, float* r_new,
-                          const float* u_cur, const float* d_cur, const float* l_cur, const float* r_cur,
-                          float* selected_disp_pyr_new, const float* selected_disp_pyr_cur,
-                          float* data_cost_selected, const float* data_cost, size_t msg_step,
-                          int h, int w, int nr_plane, int h2, int w2, int nr_plane2, cudaStream_t stream);
-
-        ///////////////////////////////////////////////////////////////
-        ////////////////////  calc all iterations /////////////////////
-        ///////////////////////////////////////////////////////////////
-
-        template <typename T>
-        __device__ void message_per_pixel(const T* data, T* msg_dst, const T* msg1, const T* msg2, const T* msg3,
-                                          const T* dst_disp, const T* src_disp, int nr_plane, volatile T* temp)
-        {
-            T minimum = numeric_limits<T>::max();
-
-            for(int d = 0; d < nr_plane; d++)
-            {
-                int idx = d * cdisp_step1;
-                T val  = data[idx] + msg1[idx] + msg2[idx] + msg3[idx];
-
-                if(val < minimum)
-                    minimum = val;
-
-                msg_dst[idx] = val;
-            }
-
-            float sum = 0;
-            for(int d = 0; d < nr_plane; d++)
-            {
-                float cost_min = minimum + cmax_disc_term;
-                T src_disp_reg = src_disp[d * cdisp_step1];
-
-                for(int d2 = 0; d2 < nr_plane; d2++)
-                    cost_min = fmin(cost_min, msg_dst[d2 * cdisp_step1] + cdisc_single_jump * ::abs(dst_disp[d2 * cdisp_step1] - src_disp_reg));
-
-                temp[d * cdisp_step1] = saturate_cast<T>(cost_min);
-                sum += cost_min;
-            }
-            sum /= nr_plane;
-
-            for(int d = 0; d < nr_plane; d++)
-                msg_dst[d * cdisp_step1] = saturate_cast<T>(temp[d * cdisp_step1] - sum);
-        }
-
-        template <typename T>
-        __global__ void compute_message(T* u_, T* d_, T* l_, T* r_, const T* data_cost_selected, const T* selected_disp_pyr_cur, int h, int w, int nr_plane, int i)
-        {
-            int y = blockIdx.y * blockDim.y + threadIdx.y;
-            int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + i) & 1);
-
-            if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
-            {
-                const T* data = data_cost_selected + y * cmsg_step + x;
-
-                T* u = u_ + y * cmsg_step + x;
-                T* d = d_ + y * cmsg_step + x;
-                T* l = l_ + y * cmsg_step + x;
-                T* r = r_ + y * cmsg_step + x;
-
-                const T* disp = selected_disp_pyr_cur + y * cmsg_step + x;
-
-                T* temp = (T*)ctemp + y * cmsg_step + x;
-
-                message_per_pixel(data, u, r - 1, u + cmsg_step, l + 1, disp, disp - cmsg_step, nr_plane, temp);
-                message_per_pixel(data, d, d - cmsg_step, r - 1, l + 1, disp, disp + cmsg_step, nr_plane, temp);
-                message_per_pixel(data, l, u + cmsg_step, d - cmsg_step, l + 1, disp, disp - 1, nr_plane, temp);
-                message_per_pixel(data, r, u + cmsg_step, d - cmsg_step, r - 1, disp, disp + 1, nr_plane, temp);
-            }
-        }
-
-
-        template<class T>
-        void calc_all_iterations(T* u, T* d, T* l, T* r, const T* data_cost_selected,
-            const T* selected_disp_pyr_cur, size_t msg_step, int h, int w, int nr_plane, int iters, cudaStream_t stream)
-        {
-            size_t disp_step = msg_step * h;
-            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,  &msg_step,  sizeof(size_t)) );
-
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(w, threads.x << 1);
-            grid.y = divUp(h, threads.y);
-
-            for(int t = 0; t < iters; ++t)
-            {
-                compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);
-                cudaSafeCall( cudaGetLastError() );
-            }
-            if (stream == 0)
-                    cudaSafeCall( cudaDeviceSynchronize() );
-        };
-
-        template void calc_all_iterations(short* u, short* d, short* l, short* r, const short* data_cost_selected, const short* selected_disp_pyr_cur, size_t msg_step,
-            int h, int w, int nr_plane, int iters, cudaStream_t stream);
-
-        template void calc_all_iterations(float* u, float* d, float* l, float* r, const float* data_cost_selected, const float* selected_disp_pyr_cur, size_t msg_step,
-            int h, int w, int nr_plane, int iters, cudaStream_t stream);
-
-
-        ///////////////////////////////////////////////////////////////
-        /////////////////////////// output ////////////////////////////
-        ///////////////////////////////////////////////////////////////
-
-
-        template <typename T>
-        __global__ void compute_disp(const T* u_, const T* d_, const T* l_, const T* r_,
-                                     const T* data_cost_selected, const T* disp_selected_pyr,
-                                     PtrStepSz<short> disp, int nr_plane)
-        {
-            int x = blockIdx.x * blockDim.x + threadIdx.x;
-            int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
-            {
-                const T* data = data_cost_selected + y * cmsg_step + x;
-                const T* disp_selected = disp_selected_pyr + y * cmsg_step + x;
-
-                const T* u = u_ + (y+1) * cmsg_step + (x+0);
-                const T* d = d_ + (y-1) * cmsg_step + (x+0);
-                const T* l = l_ + (y+0) * cmsg_step + (x+1);
-                const T* r = r_ + (y+0) * cmsg_step + (x-1);
-
-                int best = 0;
-                T best_val = numeric_limits<T>::max();
-                for (int i = 0; i < nr_plane; ++i)
-                {
-                    int idx = i * cdisp_step1;
-                    T val = data[idx]+ u[idx] + d[idx] + l[idx] + r[idx];
-
-                    if (val < best_val)
-                    {
-                        best_val = val;
-                        best = saturate_cast<short>(disp_selected[idx]);
-                    }
-                }
-                disp(y, x) = best;
-            }
-        }
-
-        template<class T>
-        void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
-            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream)
-        {
-            size_t disp_step = disp.rows * msg_step;
-            cudaSafeCall( cudaMemcpyToSymbol(cdisp_step1, &disp_step, sizeof(size_t)) );
-            cudaSafeCall( cudaMemcpyToSymbol(cmsg_step,  &msg_step,  sizeof(size_t)) );
-
-            dim3 threads(32, 8, 1);
-            dim3 grid(1, 1, 1);
-
-            grid.x = divUp(disp.cols, threads.x);
-            grid.y = divUp(disp.rows, threads.y);
-
-            compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected, disp, nr_plane);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        template void compute_disp(const short* u, const short* d, const short* l, const short* r, const short* data_cost_selected, const short* disp_selected, size_t msg_step,
-            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
-
-        template void compute_disp(const float* u, const float* d, const float* l, const float* r, const float* data_cost_selected, const float* disp_selected, size_t msg_step,
-            const PtrStepSz<short>& disp, int nr_plane, cudaStream_t stream);
-    } // namespace stereocsbp
-}}} // namespace cv { namespace gpu { namespace cudev {
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuda/texture_binder.hpp
+++ b/modules/gpu/src/cuda/texture_binder.hpp
@@ -1,92 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef OPENCV_GPU_TEXTURE_BINDER_HPP_
-#define OPENCV_GPU_TEXTURE_BINDER_HPP_
-
-#include "opencv2/gpu/devmem2d.hpp"
-#include <safe_call.hpp>
-
-namespace cv
-{
-  namespace gpu
-  {
-    class TextureBinder
-    {
-    public:
-      template<class T, enum cudaTextureReadMode readMode>
-      TextureBinder(const PtrStepSz<T>& arr, const struct texture<T, 2, readMode>& tex) : texref(&tex)
-      {
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
-        cudaSafeCall( cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step) );
-      }
-
-      template<class T, enum cudaTextureReadMode readMode>
-      TextureBinder(const PtrSz<T>& arr, const struct texture<T, 1, readMode> &tex) : texref(&tex)
-      {
-        cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
-        cudaSafeCall( cudaBindTexture(0, tex, arr.data, desc, arr.size * arr.elemSize()) );
-      }
-
-      template<class A, class T, enum cudaTextureReadMode readMode>
-      TextureBinder(const A& arr, const struct texture<T, 2, readMode>& tex, const cudaChannelFormatDesc& desc) : texref(&tex)
-      {
-        cudaSafeCall( cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step) );
-      }
-
-
-      ~TextureBinder()
-      {
-        cudaSafeCall( cudaUnbindTexture(texref) );
-      }
-    private:
-      const struct textureReference *texref;
-    };
-  }
-
-  namespace cuda
-  {
-      using cv::gpu::TextureBinder;
-  }
-}
-
-#endif /* OPENCV_GPU_TEXTURE_BINDER_HPP_*/
--- a/modules/gpu/src/cuda/tvl1flow.cu
+++ b/modules/gpu/src/cuda/tvl1flow.cu
@@ -1,332 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-#include "opencv2/core/cuda/limits.hpp"
-
-using namespace cv::gpu;
-using namespace cv::gpu::cudev;
-
-////////////////////////////////////////////////////////////
-// centeredGradient
-
-namespace tvl1flow
-{
-    __global__ void centeredGradientKernel(const PtrStepSzf src, PtrStepf dx, PtrStepf dy)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x >= src.cols || y >= src.rows)
-            return;
-
-        dx(y, x) = 0.5f * (src(y, ::min(x + 1, src.cols - 1)) - src(y, ::max(x - 1, 0)));
-        dy(y, x) = 0.5f * (src(::min(y + 1, src.rows - 1), x) - src(::max(y - 1, 0), x));
-    }
-
-    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy)
-    {
-        const dim3 block(32, 8);
-        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-
-        centeredGradientKernel<<<grid, block>>>(src, dx, dy);
-        cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-////////////////////////////////////////////////////////////
-// warpBackward
-
-namespace tvl1flow
-{
-    static __device__ __forceinline__ float bicubicCoeff(float x_)
-    {
-        float x = fabsf(x_);
-        if (x <= 1.0f)
-        {
-            return x * x * (1.5f * x - 2.5f) + 1.0f;
-        }
-        else if (x < 2.0f)
-        {
-            return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;
-        }
-        else
-        {
-            return 0.0f;
-        }
-    }
-
-    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
-    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
-    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
-
-    __global__ void warpBackwardKernel(const PtrStepSzf I0, const PtrStepf u1, const PtrStepf u2, PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x >= I0.cols || y >= I0.rows)
-            return;
-
-        const float u1Val = u1(y, x);
-        const float u2Val = u2(y, x);
-
-        const float wx = x + u1Val;
-        const float wy = y + u2Val;
-
-        const int xmin = ::ceilf(wx - 2.0f);
-        const int xmax = ::floorf(wx + 2.0f);
-
-        const int ymin = ::ceilf(wy - 2.0f);
-        const int ymax = ::floorf(wy + 2.0f);
-
-        float sum  = 0.0f;
-        float sumx = 0.0f;
-        float sumy = 0.0f;
-        float wsum = 0.0f;
-
-        for (int cy = ymin; cy <= ymax; ++cy)
-        {
-            for (int cx = xmin; cx <= xmax; ++cx)
-            {
-                const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
-
-                sum  += w * tex2D(tex_I1 , cx, cy);
-                sumx += w * tex2D(tex_I1x, cx, cy);
-                sumy += w * tex2D(tex_I1y, cx, cy);
-
-                wsum += w;
-            }
-        }
-
-        const float coeff = 1.0f / wsum;
-
-        const float I1wVal  = sum  * coeff;
-        const float I1wxVal = sumx * coeff;
-        const float I1wyVal = sumy * coeff;
-
-        I1w(y, x)  = I1wVal;
-        I1wx(y, x) = I1wxVal;
-        I1wy(y, x) = I1wyVal;
-
-        const float Ix2 = I1wxVal * I1wxVal;
-        const float Iy2 = I1wyVal * I1wyVal;
-
-        // store the |Grad(I1)|^2
-        grad(y, x) = Ix2 + Iy2;
-
-        // compute the constant part of the rho function
-        const float I0Val = I0(y, x);
-        rho(y, x) = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
-    }
-
-    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho)
-    {
-        const dim3 block(32, 8);
-        const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
-
-        bindTexture(&tex_I1 , I1);
-        bindTexture(&tex_I1x, I1x);
-        bindTexture(&tex_I1y, I1y);
-
-        warpBackwardKernel<<<grid, block>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
-        cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-////////////////////////////////////////////////////////////
-// estimateU
-
-namespace tvl1flow
-{
-    __device__ float divergence(const PtrStepf& v1, const PtrStepf& v2, int y, int x)
-    {
-        if (x > 0 && y > 0)
-        {
-            const float v1x = v1(y, x) - v1(y, x - 1);
-            const float v2y = v2(y, x) - v2(y - 1, x);
-            return v1x + v2y;
-        }
-        else
-        {
-            if (y > 0)
-                return v1(y, 0) + v2(y, 0) - v2(y - 1, 0);
-            else
-            {
-                if (x > 0)
-                    return v1(0, x) - v1(0, x - 1) + v2(0, x);
-                else
-                    return v1(0, 0) + v2(0, 0);
-            }
-        }
-    }
-
-    __global__ void estimateUKernel(const PtrStepSzf I1wx, const PtrStepf I1wy,
-                              const PtrStepf grad, const PtrStepf rho_c,
-                              const PtrStepf p11, const PtrStepf p12, const PtrStepf p21, const PtrStepf p22,
-                              PtrStepf u1, PtrStepf u2, PtrStepf error,
-                              const float l_t, const float theta)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x >= I1wx.cols || y >= I1wx.rows)
-            return;
-
-        const float I1wxVal = I1wx(y, x);
-        const float I1wyVal = I1wy(y, x);
-        const float gradVal = grad(y, x);
-        const float u1OldVal = u1(y, x);
-        const float u2OldVal = u2(y, x);
-
-        const float rho = rho_c(y, x) + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);
-
-        // estimate the values of the variable (v1, v2) (thresholding operator TH)
-
-        float d1 = 0.0f;
-        float d2 = 0.0f;
-
-        if (rho < -l_t * gradVal)
-        {
-            d1 = l_t * I1wxVal;
-            d2 = l_t * I1wyVal;
-        }
-        else if (rho > l_t * gradVal)
-        {
-            d1 = -l_t * I1wxVal;
-            d2 = -l_t * I1wyVal;
-        }
-        else if (gradVal > numeric_limits<float>::epsilon())
-        {
-            const float fi = -rho / gradVal;
-            d1 = fi * I1wxVal;
-            d2 = fi * I1wyVal;
-        }
-
-        const float v1 = u1OldVal + d1;
-        const float v2 = u2OldVal + d2;
-
-        // compute the divergence of the dual variable (p1, p2)
-
-        const float div_p1 = divergence(p11, p12, y, x);
-        const float div_p2 = divergence(p21, p22, y, x);
-
-        // estimate the values of the optical flow (u1, u2)
-
-        const float u1NewVal = v1 + theta * div_p1;
-        const float u2NewVal = v2 + theta * div_p2;
-
-        u1(y, x) = u1NewVal;
-        u2(y, x) = u2NewVal;
-
-        const float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);
-        const float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);
-        error(y, x) = n1 + n2;
-    }
-
-    void estimateU(PtrStepSzf I1wx, PtrStepSzf I1wy,
-                   PtrStepSzf grad, PtrStepSzf rho_c,
-                   PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22,
-                   PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf error,
-                   float l_t, float theta)
-    {
-        const dim3 block(32, 8);
-        const dim3 grid(divUp(I1wx.cols, block.x), divUp(I1wx.rows, block.y));
-
-        estimateUKernel<<<grid, block>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, u1, u2, error, l_t, theta);
-        cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-////////////////////////////////////////////////////////////
-// estimateDualVariables
-
-namespace tvl1flow
-{
-    __global__ void estimateDualVariablesKernel(const PtrStepSzf u1, const PtrStepf u2, PtrStepf p11, PtrStepf p12, PtrStepf p21, PtrStepf p22, const float taut)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x >= u1.cols || y >= u1.rows)
-            return;
-
-        const float u1x = u1(y, ::min(x + 1, u1.cols - 1)) - u1(y, x);
-        const float u1y = u1(::min(y + 1, u1.rows - 1), x) - u1(y, x);
-
-        const float u2x = u2(y, ::min(x + 1, u1.cols - 1)) - u2(y, x);
-        const float u2y = u2(::min(y + 1, u1.rows - 1), x) - u2(y, x);
-
-        const float g1 = ::hypotf(u1x, u1y);
-        const float g2 = ::hypotf(u2x, u2y);
-
-        const float ng1 = 1.0f + taut * g1;
-        const float ng2 = 1.0f + taut * g2;
-
-        p11(y, x) = (p11(y, x) + taut * u1x) / ng1;
-        p12(y, x) = (p12(y, x) + taut * u1y) / ng1;
-        p21(y, x) = (p21(y, x) + taut * u2x) / ng2;
-        p22(y, x) = (p22(y, x) + taut * u2y) / ng2;
-    }
-
-    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, float taut)
-    {
-        const dim3 block(32, 8);
-        const dim3 grid(divUp(u1.cols, block.x), divUp(u1.rows, block.y));
-
-        estimateDualVariablesKernel<<<grid, block>>>(u1, u2, p11, p12, p21, p22, taut);
-        cudaSafeCall( cudaGetLastError() );
-
-        cudaSafeCall( cudaDeviceSynchronize() );
-    }
-}
-
-#endif // !defined CUDA_DISABLER
--- a/modules/gpu/src/cuda/warp.cu
+++ b/modules/gpu/src/cuda/warp.cu
@@ -1,389 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#if !defined CUDA_DISABLER
-
-#include "opencv2/core/cuda/common.hpp"
-#include "opencv2/core/cuda/border_interpolate.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-#include "opencv2/core/cuda/vec_math.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/filters.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        __constant__ float c_warpMat[3 * 3];
-
-        struct AffineTransform
-        {
-            static __device__ __forceinline__ float2 calcCoord(int x, int y)
-            {
-                const float xcoo = c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2];
-                const float ycoo = c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5];
-
-                return make_float2(xcoo, ycoo);
-            }
-        };
-
-        struct PerspectiveTransform
-        {
-            static __device__ __forceinline__ float2 calcCoord(int x, int y)
-            {
-                const float coeff = 1.0f / (c_warpMat[6] * x + c_warpMat[7] * y + c_warpMat[8]);
-
-                const float xcoo = coeff * (c_warpMat[0] * x + c_warpMat[1] * y + c_warpMat[2]);
-                const float ycoo = coeff * (c_warpMat[3] * x + c_warpMat[4] * y + c_warpMat[5]);
-
-                return make_float2(xcoo, ycoo);
-            }
-        };
-
-        ///////////////////////////////////////////////////////////////////
-        // Build Maps
-
-        template <class Transform> __global__ void buildWarpMaps(PtrStepSzf xmap, PtrStepf ymap)
-        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < xmap.cols && y < xmap.rows)
-            {
-                const float2 coord = Transform::calcCoord(x, y);
-
-                xmap(y, x) = coord.x;
-                ymap(y, x) = coord.y;
-            }
-        }
-
-        template <class Transform> void buildWarpMaps_caller(PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
-        {
-            dim3 block(32, 8);
-            dim3 grid(divUp(xmap.cols, block.x), divUp(xmap.rows, block.y));
-
-            buildWarpMaps<Transform><<<grid, block, 0, stream>>>(xmap, ymap);
-            cudaSafeCall( cudaGetLastError() );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        void buildWarpAffineMaps_gpu(float coeffs[2 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
-
-            buildWarpMaps_caller<AffineTransform>(xmap, ymap, stream);
-        }
-
-        void buildWarpPerspectiveMaps_gpu(float coeffs[3 * 3], PtrStepSzf xmap, PtrStepSzf ymap, cudaStream_t stream)
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
-
-            buildWarpMaps_caller<PerspectiveTransform>(xmap, ymap, stream);
-        }
-
-        ///////////////////////////////////////////////////////////////////
-        // Warp
-
-        template <class Transform, class Ptr2D, typename T> __global__ void warp(const Ptr2D src, PtrStepSz<T> dst)
-        {
-            const int x = blockDim.x * blockIdx.x + threadIdx.x;
-            const int y = blockDim.y * blockIdx.y + threadIdx.y;
-
-            if (x < dst.cols && y < dst.rows)
-            {
-                const float2 coord = Transform::calcCoord(x, y);
-
-                dst.ptr(y)[x] = saturate_cast<T>(src(coord.y, coord.x));
-            }
-        }
-
-        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherStream
-        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool)
-            {
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
-
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
-                BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
-
-                warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst);
-                cudaSafeCall( cudaGetLastError() );
-            }
-        };
-
-        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStream
-        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, bool)
-            {
-                (void)xoff;
-                (void)yoff;
-                (void)srcWhole;
-
-                typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
-
-                dim3 block(32, 8);
-                dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-
-                B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
-                BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
-                Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
-
-                warp<Transform><<<grid, block>>>(filter_src, dst);
-                cudaSafeCall( cudaGetLastError() );
-
-                cudaSafeCall( cudaDeviceSynchronize() );
-            }
-        };
-
-        #define OPENCV_GPU_IMPLEMENT_WARP_TEX(type) \
-            texture< type , cudaTextureType2D > tex_warp_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
-            struct tex_warp_ ## type ## _reader \
-            { \
-                typedef type elem_type; \
-                typedef int index_type; \
-                int xoff, yoff; \
-                tex_warp_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
-                __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
-                { \
-                    return tex2D(tex_warp_ ## type , x + xoff, y + yoff); \
-                } \
-            }; \
-            template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
-            { \
-                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, bool cc20) \
-                { \
-                    typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
-                    dim3 block(32, cc20 ? 8 : 4); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_warp_ ## type , srcWhole); \
-                    tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
-                    B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
-                    BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
-                    Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
-                    warp<Transform><<<grid, block>>>(filter_src, dst); \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            }; \
-            template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
-            { \
-                static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, bool) \
-                { \
-                    dim3 block(32, 8); \
-                    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
-                    bindTexture(&tex_warp_ ## type , srcWhole); \
-                    tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
-                    if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
-                    { \
-                        Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
-                        warp<Transform><<<grid, block>>>(filter_src, dst); \
-                    } \
-                    else \
-                    { \
-                        BrdReplicate<type> brd(src.rows, src.cols); \
-                        BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
-                        Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
-                        warp<Transform><<<grid, block>>>(filter_src, dst); \
-                    } \
-                    cudaSafeCall( cudaGetLastError() ); \
-                    cudaSafeCall( cudaDeviceSynchronize() ); \
-                } \
-            };
-
-        OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar)
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar2)
-        OPENCV_GPU_IMPLEMENT_WARP_TEX(uchar4)
-
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(schar)
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(char2)
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(char4)
-
-        OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort)
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort2)
-        OPENCV_GPU_IMPLEMENT_WARP_TEX(ushort4)
-
-        OPENCV_GPU_IMPLEMENT_WARP_TEX(short)
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(short2)
-        OPENCV_GPU_IMPLEMENT_WARP_TEX(short4)
-
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(int)
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(int2)
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(int4)
-
-        OPENCV_GPU_IMPLEMENT_WARP_TEX(float)
-        //OPENCV_GPU_IMPLEMENT_WARP_TEX(float2)
-        OPENCV_GPU_IMPLEMENT_WARP_TEX(float4)
-
-        #undef OPENCV_GPU_IMPLEMENT_WARP_TEX
-
-        template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
-        {
-            static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20)
-            {
-                if (stream == 0)
-                    WarpDispatcherNonStream<Transform, Filter, B, T>::call(src, srcWhole, xoff, yoff, dst, borderValue, cc20);
-                else
-                    WarpDispatcherStream<Transform, Filter, B, T>::call(src, dst, borderValue, stream, cc20);
-            }
-        };
-
-        template <class Transform, typename T>
-        void warp_caller(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzb dst, int interpolation,
-                         int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
-        {
-            typedef void (*func_t)(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, cudaStream_t stream, bool cc20);
-
-            static const func_t funcs[3][5] =
-            {
-                {
-                    WarpDispatcher<Transform, PointFilter, BrdReflect101, T>::call,
-                    WarpDispatcher<Transform, PointFilter, BrdReplicate, T>::call,
-                    WarpDispatcher<Transform, PointFilter, BrdConstant, T>::call,
-                    WarpDispatcher<Transform, PointFilter, BrdReflect, T>::call,
-                    WarpDispatcher<Transform, PointFilter, BrdWrap, T>::call
-                },
-                {
-                    WarpDispatcher<Transform, LinearFilter, BrdReflect101, T>::call,
-                    WarpDispatcher<Transform, LinearFilter, BrdReplicate, T>::call,
-                    WarpDispatcher<Transform, LinearFilter, BrdConstant, T>::call,
-                    WarpDispatcher<Transform, LinearFilter, BrdReflect, T>::call,
-                    WarpDispatcher<Transform, LinearFilter, BrdWrap, T>::call
-                },
-                {
-                    WarpDispatcher<Transform, CubicFilter, BrdReflect101, T>::call,
-                    WarpDispatcher<Transform, CubicFilter, BrdReplicate, T>::call,
-                    WarpDispatcher<Transform, CubicFilter, BrdConstant, T>::call,
-                    WarpDispatcher<Transform, CubicFilter, BrdReflect, T>::call,
-                    WarpDispatcher<Transform, CubicFilter, BrdWrap, T>::call
-                }
-            };
-
-            funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
-                static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
-        }
-
-        template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
-                                                  int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 2 * 3 * sizeof(float)) );
-
-            warp_caller<AffineTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
-        }
-
-        template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        //template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        //template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template <typename T> void warpPerspective_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation,
-                                                  int borderMode, const float* borderValue, cudaStream_t stream, bool cc20)
-        {
-            cudaSafeCall( cudaMemcpyToSymbol(c_warpMat, coeffs, 3 * 3 * sizeof(float)) );
-
-            warp_caller<PerspectiveTransform, T>(src, srcWhole, xoff, yoff, dst, interpolation, borderMode, borderValue, stream, cc20);
-        }
-
-        template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        //template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        //template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-
-        template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        //template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-        template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
-    } // namespace imgproc
-}}} // namespace cv { namespace gpu { namespace cudev
-
-
-#endif /* CUDA_DISABLER */
--- a/modules/gpu/src/cuvid_video_source.cpp
+++ b/modules/gpu/src/cuvid_video_source.cpp
@@ -1,104 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "cuvid_video_source.h"
-
-#if defined(HAVE_CUDA) && defined(HAVE_NVCUVID)
-
-cv::gpu::detail::CuvidVideoSource::CuvidVideoSource(const String& fname)
-{
-    CUVIDSOURCEPARAMS params;
-    std::memset(&params, 0, sizeof(CUVIDSOURCEPARAMS));
-
-    // Fill parameter struct
-    params.pUserData = this;                        // will be passed to data handlers
-    params.pfnVideoDataHandler = HandleVideoData;   // our local video-handler callback
-    params.pfnAudioDataHandler = 0;
-
-    // now create the actual source
-    CUresult res = cuvidCreateVideoSource(&videoSource_, fname.c_str(), &params);
-    if (res == CUDA_ERROR_INVALID_SOURCE)
-        throw std::runtime_error("Unsupported video source");
-    cuSafeCall( res );
-
-    CUVIDEOFORMAT vidfmt;
-    cuSafeCall( cuvidGetSourceVideoFormat(videoSource_, &vidfmt, 0) );
-
-    format_.codec = static_cast<VideoReader_GPU::Codec>(vidfmt.codec);
-    format_.chromaFormat = static_cast<VideoReader_GPU::ChromaFormat>(vidfmt.chroma_format);
-    format_.width = vidfmt.coded_width;
-    format_.height = vidfmt.coded_height;
-}
-
-cv::gpu::VideoReader_GPU::FormatInfo cv::gpu::detail::CuvidVideoSource::format() const
-{
-    return format_;
-}
-
-void cv::gpu::detail::CuvidVideoSource::start()
-{
-    cuSafeCall( cuvidSetVideoSourceState(videoSource_, cudaVideoState_Started) );
-}
-
-void cv::gpu::detail::CuvidVideoSource::stop()
-{
-    cuSafeCall( cuvidSetVideoSourceState(videoSource_, cudaVideoState_Stopped) );
-}
-
-bool cv::gpu::detail::CuvidVideoSource::isStarted() const
-{
-    return (cuvidGetVideoSourceState(videoSource_) == cudaVideoState_Started);
-}
-
-bool cv::gpu::detail::CuvidVideoSource::hasError() const
-{
-    return (cuvidGetVideoSourceState(videoSource_) == cudaVideoState_Error);
-}
-
-int CUDAAPI cv::gpu::detail::CuvidVideoSource::HandleVideoData(void* userData, CUVIDSOURCEDATAPACKET* packet)
-{
-    CuvidVideoSource* thiz = static_cast<CuvidVideoSource*>(userData);
-
-    return thiz->parseVideoData(packet->payload, packet->payload_size, (packet->flags & CUVID_PKT_ENDOFSTREAM) != 0);
-}
-
-#endif // defined(HAVE_CUDA) && !defined(__APPLE__)
--- a/modules/gpu/src/cuvid_video_source.h
+++ b/modules/gpu/src/cuvid_video_source.h
@@ -1,90 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __CUVUD_VIDEO_SOURCE_H__
-#define __CUVUD_VIDEO_SOURCE_H__
-
-#include "precomp.hpp"
-
-#if defined(HAVE_CUDA) && defined(HAVE_NVCUVID)
-
-namespace cv { namespace gpu
-{
-    namespace detail
-    {
-        class CuvidVideoSource : public VideoReader_GPU::VideoSource
-        {
-        public:
-            explicit CuvidVideoSource(const String& fname);
-            ~CuvidVideoSource() { cuvidDestroyVideoSource(videoSource_); }
-
-            VideoReader_GPU::FormatInfo format() const;
-            void start();
-            void stop();
-            bool isStarted() const;
-            bool hasError() const;
-
-        private:
-            CuvidVideoSource(const CuvidVideoSource&);
-            CuvidVideoSource& operator =(const CuvidVideoSource&);
-
-            // Callback for handling packages of demuxed video data.
-            //
-            // Parameters:
-            //      pUserData - Pointer to user data. We must pass a pointer to a
-            //          VideoSourceData struct here, that contains a valid CUvideoparser
-            //          and FrameQueue.
-            //      pPacket - video-source data packet.
-            //
-            // NOTE: called from a different thread that doesn't not have a cuda context
-            //
-            static int CUDAAPI HandleVideoData(void* pUserData, CUVIDSOURCEDATAPACKET* pPacket);
-
-            CUvideosource videoSource_;
-            VideoReader_GPU::FormatInfo format_;
-        };
-    }
-}}
-
-#endif // defined(HAVE_CUDA) && !defined(__APPLE__)
-
-#endif // __CUVUD_VIDEO_SOURCE_H__
--- a/modules/gpu/src/cvt_color_internal.h
+++ b/modules/gpu/src/cvt_color_internal.h
@@ -1,274 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __cvt_color_internal_h__
-#define __cvt_color_internal_h__
-
-namespace cv { namespace gpu { namespace cudev
-{
-#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
-    void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
-
-#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name)       \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)    \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u)   \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
-
-#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name)    \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)   \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
-
-#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(name)    \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u)        \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)       \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u)   \
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_lab)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_lab)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_lab4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_lab4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_lab)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_lab)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_lab4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_lab4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lrgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lrgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lbgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lbgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_luv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_luv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_luv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_luv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_luv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_luv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_luv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_luv4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv4)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv4)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_rgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_rgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_bgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_bgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_bgra)
-
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lrgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgb)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lrgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgba)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lbgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgr)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lbgra)
-    OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgra)
-
-    #undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
-    #undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
-    #undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F
-    #undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL
-}}}
-
-#endif
--- a/modules/gpu/src/denoising.cpp
+++ b/modules/gpu/src/denoising.cpp
@@ -1,198 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-void cv::gpu::bilateralFilter(const GpuMat&, GpuMat&, int, float, float, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::nonLocalMeans(const GpuMat&, GpuMat&, float, int, int, int, Stream&) { throw_no_cuda(); }
-
-void cv::gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat&, GpuMat&, float, int, int, Stream&) { throw_no_cuda(); }
-void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat&, GpuMat&, float, float, int, int, Stream&) { throw_no_cuda(); }
-
-
-#else
-
-//////////////////////////////////////////////////////////////////////////////////
-//// Non Local Means Denosing (brute force)
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        template<typename T>
-        void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, int borderMode, cudaStream_t stream);
-
-        template<typename T>
-        void nlm_bruteforce_gpu(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream);
-    }
-}}}
-
-void cv::gpu::bilateralFilter(const GpuMat& src, GpuMat& dst, int kernel_size, float sigma_color, float sigma_spatial, int borderMode, Stream& s)
-{
-    using cv::gpu::cudev::imgproc::bilateral_filter_gpu;
-
-    typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, int borderMode, cudaStream_t s);
-
-    static const func_t funcs[6][4] =
-    {
-        {bilateral_filter_gpu<uchar>      , 0 /*bilateral_filter_gpu<uchar2>*/ , bilateral_filter_gpu<uchar3>      , bilateral_filter_gpu<uchar4>      },
-        {0 /*bilateral_filter_gpu<schar>*/, 0 /*bilateral_filter_gpu<schar2>*/ , 0 /*bilateral_filter_gpu<schar3>*/, 0 /*bilateral_filter_gpu<schar4>*/},
-        {bilateral_filter_gpu<ushort>     , 0 /*bilateral_filter_gpu<ushort2>*/, bilateral_filter_gpu<ushort3>     , bilateral_filter_gpu<ushort4>     },
-        {bilateral_filter_gpu<short>      , 0 /*bilateral_filter_gpu<short2>*/ , bilateral_filter_gpu<short3>      , bilateral_filter_gpu<short4>      },
-        {0 /*bilateral_filter_gpu<int>*/  , 0 /*bilateral_filter_gpu<int2>*/   , 0 /*bilateral_filter_gpu<int3>*/  , 0 /*bilateral_filter_gpu<int4>*/  },
-        {bilateral_filter_gpu<float>      , 0 /*bilateral_filter_gpu<float2>*/ , bilateral_filter_gpu<float3>      , bilateral_filter_gpu<float4>      }
-    };
-
-    sigma_color = (sigma_color <= 0 ) ? 1 : sigma_color;
-    sigma_spatial = (sigma_spatial <= 0 ) ? 1 : sigma_spatial;
-
-
-    int radius = (kernel_size <= 0) ? cvRound(sigma_spatial*1.5) : kernel_size/2;
-    kernel_size = std::max(radius, 1)*2 + 1;
-
-    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-    const func_t func = funcs[src.depth()][src.channels() - 1];
-    CV_Assert(func != 0);
-
-    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP);
-
-    int gpuBorderType;
-    CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
-
-    dst.create(src.size(), src.type());
-    func(src, dst, kernel_size, sigma_spatial, sigma_color, gpuBorderType, StreamAccessor::getStream(s));
-}
-
-void cv::gpu::nonLocalMeans(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, int borderMode, Stream& s)
-{
-    using cv::gpu::cudev::imgproc::nlm_bruteforce_gpu;
-    typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int search_radius, int block_radius, float h, int borderMode, cudaStream_t stream);
-
-    static const func_t funcs[4] = { nlm_bruteforce_gpu<uchar>, nlm_bruteforce_gpu<uchar2>, nlm_bruteforce_gpu<uchar3>, 0/*nlm_bruteforce_gpu<uchar4>,*/ };
-
-    CV_Assert(src.type() == CV_8U || src.type() == CV_8UC2 || src.type() == CV_8UC3);
-
-    const func_t func = funcs[src.channels() - 1];
-    CV_Assert(func != 0);
-
-    int b = borderMode;
-    CV_Assert(b == BORDER_REFLECT101 || b == BORDER_REPLICATE || b == BORDER_CONSTANT || b == BORDER_REFLECT || b == BORDER_WRAP);
-
-    int gpuBorderType;
-    CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
-
-    dst.create(src.size(), src.type());
-    func(src, dst, search_window/2, block_window/2, h, gpuBorderType, StreamAccessor::getStream(s));
-}
-
-
-//////////////////////////////////////////////////////////////////////////////////
-//// Non Local Means Denosing (fast approxinate)
-
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace imgproc
-    {
-        void nln_fast_get_buffer_size(const PtrStepSzb& src, int search_window, int block_window, int& buffer_cols, int& buffer_rows);
-
-        template<typename T>
-        void nlm_fast_gpu(const PtrStepSzb& src, PtrStepSzb dst, PtrStepi buffer,
-                          int search_window, int block_window, float h, cudaStream_t stream);
-
-        void fnlm_split_channels(const PtrStepSz<uchar3>& lab, PtrStepb l, PtrStep<uchar2> ab, cudaStream_t stream);
-        void fnlm_merge_channels(const PtrStepb& l, const PtrStep<uchar2>& ab, PtrStepSz<uchar3> lab, cudaStream_t stream);
-     }
-}}}
-
-void cv::gpu::FastNonLocalMeansDenoising::simpleMethod(const GpuMat& src, GpuMat& dst, float h, int search_window, int block_window, Stream& s)
-{
-    CV_Assert(src.depth() == CV_8U && src.channels() < 4);
-
-    int border_size = search_window/2 + block_window/2;
-    Size esize = src.size() + Size(border_size, border_size) * 2;
-
-    cv::gpu::ensureSizeIsEnough(esize, CV_8UC3, extended_src_buffer);
-    GpuMat extended_src(esize, src.type(), extended_src_buffer.ptr(), extended_src_buffer.step);
-
-    cv::gpu::copyMakeBorder(src, extended_src, border_size, border_size, border_size, border_size, cv::BORDER_DEFAULT, Scalar(), s);
-    GpuMat src_hdr = extended_src(Rect(Point2i(border_size, border_size), src.size()));
-
-    int bcols, brows;
-    cudev::imgproc::nln_fast_get_buffer_size(src_hdr, search_window, block_window, bcols, brows);
-    buffer.create(brows, bcols, CV_32S);
-
-    using namespace cv::gpu::cudev::imgproc;
-    typedef void (*nlm_fast_t)(const PtrStepSzb&, PtrStepSzb, PtrStepi, int, int, float, cudaStream_t);
-    static const nlm_fast_t funcs[] = { nlm_fast_gpu<uchar>, nlm_fast_gpu<uchar2>, nlm_fast_gpu<uchar3>, 0};
-
-    dst.create(src.size(), src.type());
-    funcs[src.channels()-1](src_hdr, dst, buffer, search_window, block_window, h, StreamAccessor::getStream(s));
-}
-
-void cv::gpu::FastNonLocalMeansDenoising::labMethod( const GpuMat& src, GpuMat& dst, float h_luminance, float h_color, int search_window, int block_window, Stream& s)
-{
-    CV_Assert(src.type() == CV_8UC3);
-
-    lab.create(src.size(), src.type());
-    cv::gpu::cvtColor(src, lab, cv::COLOR_BGR2Lab, 0, s);
-
-    l.create(src.size(), CV_8U);
-    ab.create(src.size(), CV_8UC2);
-    cudev::imgproc::fnlm_split_channels(lab, l, ab, StreamAccessor::getStream(s));
-
-    simpleMethod(l, l, h_luminance, search_window, block_window, s);
-    simpleMethod(ab, ab, h_color, search_window, block_window, s);
-
-    cudev::imgproc::fnlm_merge_channels(l, ab, lab, StreamAccessor::getStream(s));
-    cv::gpu::cvtColor(lab, dst, cv::COLOR_Lab2BGR, 0, s);
-}
-
-
-#endif
-
-
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
--- a/modules/gpu/src/error.cpp
+++ b/modules/gpu/src/error.cpp
@@ -1,188 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#ifdef HAVE_CUDA
-
-namespace
-{
-    #define error_entry(entry)  { entry, #entry }
-
-    struct ErrorEntry
-    {
-        int code;
-        const char* str;
-    };
-
-    struct ErrorEntryComparer
-    {
-        int code;
-        ErrorEntryComparer(int code_) : code(code_) {}
-        bool operator()(const ErrorEntry& e) const { return e.code == code; }
-    };
-
-    String getErrorString(int code, const ErrorEntry* errors, size_t n)
-    {
-        size_t idx = std::find_if(errors, errors + n, ErrorEntryComparer(code)) - errors;
-
-        const char* msg = (idx != n) ? errors[idx].str : "Unknown error code";
-        String str = cv::format("%s [Code = %d]", msg, code);
-
-        return str;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    // NCV errors
-
-    const ErrorEntry ncv_errors [] =
-    {
-        error_entry( NCV_SUCCESS ),
-        error_entry( NCV_UNKNOWN_ERROR ),
-        error_entry( NCV_CUDA_ERROR ),
-        error_entry( NCV_NPP_ERROR ),
-        error_entry( NCV_FILE_ERROR ),
-        error_entry( NCV_NULL_PTR ),
-        error_entry( NCV_INCONSISTENT_INPUT ),
-        error_entry( NCV_TEXTURE_BIND_ERROR ),
-        error_entry( NCV_DIMENSIONS_INVALID ),
-        error_entry( NCV_INVALID_ROI ),
-        error_entry( NCV_INVALID_STEP ),
-        error_entry( NCV_INVALID_SCALE ),
-        error_entry( NCV_INVALID_SCALE ),
-        error_entry( NCV_ALLOCATOR_NOT_INITIALIZED ),
-        error_entry( NCV_ALLOCATOR_BAD_ALLOC ),
-        error_entry( NCV_ALLOCATOR_BAD_DEALLOC ),
-        error_entry( NCV_ALLOCATOR_INSUFFICIENT_CAPACITY ),
-        error_entry( NCV_ALLOCATOR_DEALLOC_ORDER ),
-        error_entry( NCV_ALLOCATOR_BAD_REUSE ),
-        error_entry( NCV_MEM_COPY_ERROR ),
-        error_entry( NCV_MEM_RESIDENCE_ERROR ),
-        error_entry( NCV_MEM_INSUFFICIENT_CAPACITY ),
-        error_entry( NCV_HAAR_INVALID_PIXEL_STEP ),
-        error_entry( NCV_HAAR_TOO_MANY_FEATURES_IN_CLASSIFIER ),
-        error_entry( NCV_HAAR_TOO_MANY_FEATURES_IN_CASCADE ),
-        error_entry( NCV_HAAR_TOO_LARGE_FEATURES ),
-        error_entry( NCV_HAAR_XML_LOADING_EXCEPTION ),
-        error_entry( NCV_NOIMPL_HAAR_TILTED_FEATURES ),
-        error_entry( NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW ),
-        error_entry( NPPST_SUCCESS ),
-        error_entry( NPPST_ERROR ),
-        error_entry( NPPST_CUDA_KERNEL_EXECUTION_ERROR ),
-        error_entry( NPPST_NULL_POINTER_ERROR ),
-        error_entry( NPPST_TEXTURE_BIND_ERROR ),
-        error_entry( NPPST_MEMCPY_ERROR ),
-        error_entry( NPPST_MEM_ALLOC_ERR ),
-        error_entry( NPPST_MEMFREE_ERR ),
-        error_entry( NPPST_INVALID_ROI ),
-        error_entry( NPPST_INVALID_STEP ),
-        error_entry( NPPST_INVALID_SCALE ),
-        error_entry( NPPST_MEM_INSUFFICIENT_BUFFER ),
-        error_entry( NPPST_MEM_RESIDENCE_ERROR ),
-        error_entry( NPPST_MEM_INTERNAL_ERROR )
-    };
-
-    const size_t ncv_error_num = sizeof(ncv_errors) / sizeof(ncv_errors[0]);
-
-    //////////////////////////////////////////////////////////////////////////
-    // CUFFT errors
-
-    const ErrorEntry cufft_errors[] =
-    {
-        error_entry( CUFFT_INVALID_PLAN ),
-        error_entry( CUFFT_ALLOC_FAILED ),
-        error_entry( CUFFT_INVALID_TYPE ),
-        error_entry( CUFFT_INVALID_VALUE ),
-        error_entry( CUFFT_INTERNAL_ERROR ),
-        error_entry( CUFFT_EXEC_FAILED ),
-        error_entry( CUFFT_SETUP_FAILED ),
-        error_entry( CUFFT_INVALID_SIZE ),
-        error_entry( CUFFT_UNALIGNED_DATA )
-    };
-
-    const int cufft_error_num = sizeof(cufft_errors) / sizeof(cufft_errors[0]);
-
-    //////////////////////////////////////////////////////////////////////////
-    // CUBLAS errors
-
-    const ErrorEntry cublas_errors[] =
-    {
-        error_entry( CUBLAS_STATUS_SUCCESS ),
-        error_entry( CUBLAS_STATUS_NOT_INITIALIZED ),
-        error_entry( CUBLAS_STATUS_ALLOC_FAILED ),
-        error_entry( CUBLAS_STATUS_INVALID_VALUE ),
-        error_entry( CUBLAS_STATUS_ARCH_MISMATCH ),
-        error_entry( CUBLAS_STATUS_MAPPING_ERROR ),
-        error_entry( CUBLAS_STATUS_EXECUTION_FAILED ),
-        error_entry( CUBLAS_STATUS_INTERNAL_ERROR )
-    };
-
-    const int cublas_error_num = sizeof(cublas_errors) / sizeof(cublas_errors[0]);
-}
-
-namespace cv
-{
-    namespace gpu
-    {
-        void ncvError(int code, const char* file, const int line, const char* func)
-        {
-            String msg = getErrorString(code, ncv_errors, ncv_error_num);
-            cv::error(cv::Error::GpuApiCallError, msg, func, file, line);
-        }
-
-        void cufftError(int code, const char* file, const int line, const char* func)
-        {
-            String msg = getErrorString(code, cufft_errors, cufft_error_num);
-            cv::error(cv::Error::GpuApiCallError, msg, func, file, line);
-        }
-
-        void cublasError(int code, const char* file, const int line, const char* func)
-        {
-            String msg = getErrorString(code, cublas_errors, cublas_error_num);
-            cv::error(cv::Error::GpuApiCallError, msg, func, file, line);
-        }
-    }
-}
-
-#endif
--- a/modules/gpu/src/fast.cpp
+++ b/modules/gpu/src/fast.cpp
@@ -1,170 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-using namespace cv;
-using namespace cv::gpu;
-
-#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
-
-cv::gpu::FAST_GPU::FAST_GPU(int, bool, double) { throw_no_cuda(); }
-void cv::gpu::FAST_GPU::operator ()(const GpuMat&, const GpuMat&, GpuMat&) { throw_no_cuda(); }
-void cv::gpu::FAST_GPU::operator ()(const GpuMat&, const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
-void cv::gpu::FAST_GPU::downloadKeypoints(const GpuMat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
-void cv::gpu::FAST_GPU::convertKeypoints(const Mat&, std::vector<KeyPoint>&) { throw_no_cuda(); }
-void cv::gpu::FAST_GPU::release() { throw_no_cuda(); }
-int cv::gpu::FAST_GPU::calcKeyPointsLocation(const GpuMat&, const GpuMat&) { throw_no_cuda(); return 0; }
-int cv::gpu::FAST_GPU::getKeyPoints(GpuMat&) { throw_no_cuda(); return 0; }
-
-#else /* !defined (HAVE_CUDA) */
-
-cv::gpu::FAST_GPU::FAST_GPU(int _threshold, bool _nonmaxSupression, double _keypointsRatio) :
-    nonmaxSupression(_nonmaxSupression), threshold(_threshold), keypointsRatio(_keypointsRatio), count_(0)
-{
-}
-
-void cv::gpu::FAST_GPU::operator ()(const GpuMat& image, const GpuMat& mask, std::vector<KeyPoint>& keypoints)
-{
-    if (image.empty())
-        return;
-
-    (*this)(image, mask, d_keypoints_);
-    downloadKeypoints(d_keypoints_, keypoints);
-}
-
-void cv::gpu::FAST_GPU::downloadKeypoints(const GpuMat& d_keypoints, std::vector<KeyPoint>& keypoints)
-{
-    if (d_keypoints.empty())
-        return;
-
-    Mat h_keypoints(d_keypoints);
-    convertKeypoints(h_keypoints, keypoints);
-}
-
-void cv::gpu::FAST_GPU::convertKeypoints(const Mat& h_keypoints, std::vector<KeyPoint>& keypoints)
-{
-    if (h_keypoints.empty())
-        return;
-
-    CV_Assert(h_keypoints.rows == ROWS_COUNT && h_keypoints.elemSize() == 4);
-
-    int npoints = h_keypoints.cols;
-
-    keypoints.resize(npoints);
-
-    const short2* loc_row = h_keypoints.ptr<short2>(LOCATION_ROW);
-    const float* response_row = h_keypoints.ptr<float>(RESPONSE_ROW);
-
-    for (int i = 0; i < npoints; ++i)
-    {
-        KeyPoint kp(loc_row[i].x, loc_row[i].y, static_cast<float>(FEATURE_SIZE), -1, response_row[i]);
-        keypoints[i] = kp;
-    }
-}
-
-void cv::gpu::FAST_GPU::operator ()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints)
-{
-    calcKeyPointsLocation(img, mask);
-    keypoints.cols = getKeyPoints(keypoints);
-}
-
-namespace cv { namespace gpu { namespace cudev
-{
-    namespace fast
-    {
-        int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold);
-        int nonmaxSupression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response);
-    }
-}}}
-
-int cv::gpu::FAST_GPU::calcKeyPointsLocation(const GpuMat& img, const GpuMat& mask)
-{
-    using namespace cv::gpu::cudev::fast;
-
-    CV_Assert(img.type() == CV_8UC1);
-    CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()));
-
-    int maxKeypoints = static_cast<int>(keypointsRatio * img.size().area());
-
-    ensureSizeIsEnough(1, maxKeypoints, CV_16SC2, kpLoc_);
-
-    if (nonmaxSupression)
-    {
-        ensureSizeIsEnough(img.size(), CV_32SC1, score_);
-        score_.setTo(Scalar::all(0));
-    }
-
-    count_ = calcKeypoints_gpu(img, mask, kpLoc_.ptr<short2>(), maxKeypoints, nonmaxSupression ? score_ : PtrStepSzi(), threshold);
-    count_ = std::min(count_, maxKeypoints);
-
-    return count_;
-}
-
-int cv::gpu::FAST_GPU::getKeyPoints(GpuMat& keypoints)
-{
-    using namespace cv::gpu::cudev::fast;
-
-    if (count_ == 0)
-        return 0;
-
-    ensureSizeIsEnough(ROWS_COUNT, count_, CV_32FC1, keypoints);
-
-    if (nonmaxSupression)
-        return nonmaxSupression_gpu(kpLoc_.ptr<short2>(), count_, score_, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW));
-
-    GpuMat locRow(1, count_, kpLoc_.type(), keypoints.ptr(0));
-    kpLoc_.colRange(0, count_).copyTo(locRow);
-    keypoints.row(1).setTo(Scalar::all(0));
-
-    return count_;
-}
-
-void cv::gpu::FAST_GPU::release()
-{
-    kpLoc_.release();
-    score_.release();
-
-    d_keypoints_.release();
-}
-
-#endif /* !defined (HAVE_CUDA) */
--- a/modules/gpu/src/ffmpeg_video_source.cpp
+++ b/modules/gpu/src/ffmpeg_video_source.cpp
@@ -1,182 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "ffmpeg_video_source.h"
-
-#if defined(HAVE_CUDA) && defined(HAVE_NVCUVID)
-
-#if defined(HAVE_FFMPEG) && defined(BUILD_SHARED_LIBS)
-    #include "../src/cap_ffmpeg_impl.hpp"
-#else
-    #include "../src/cap_ffmpeg_api.hpp"
-#endif
-
-namespace
-{
-    Create_InputMediaStream_FFMPEG_Plugin create_InputMediaStream_FFMPEG_p = 0;
-    Release_InputMediaStream_FFMPEG_Plugin release_InputMediaStream_FFMPEG_p = 0;
-    Read_InputMediaStream_FFMPEG_Plugin read_InputMediaStream_FFMPEG_p = 0;
-
-    bool init_MediaStream_FFMPEG()
-    {
-        static bool initialized = 0;
-
-        if (!initialized)
-        {
-            #if defined WIN32 || defined _WIN32
-                const char* module_name = "opencv_ffmpeg"
-                    CVAUX_STR(CV_VERSION_EPOCH) CVAUX_STR(CV_VERSION_MAJOR) CVAUX_STR(CV_VERSION_MINOR)
-                #if (defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__)
-                    "_64"
-                #endif
-                    ".dll";
-
-                static HMODULE cvFFOpenCV = LoadLibrary(module_name);
-
-                if (cvFFOpenCV)
-                {
-                    create_InputMediaStream_FFMPEG_p =
-                        (Create_InputMediaStream_FFMPEG_Plugin)GetProcAddress(cvFFOpenCV, "create_InputMediaStream_FFMPEG");
-                    release_InputMediaStream_FFMPEG_p =
-                        (Release_InputMediaStream_FFMPEG_Plugin)GetProcAddress(cvFFOpenCV, "release_InputMediaStream_FFMPEG");
-                    read_InputMediaStream_FFMPEG_p =
-                        (Read_InputMediaStream_FFMPEG_Plugin)GetProcAddress(cvFFOpenCV, "read_InputMediaStream_FFMPEG");
-
-                    initialized = create_InputMediaStream_FFMPEG_p != 0 && release_InputMediaStream_FFMPEG_p != 0 && read_InputMediaStream_FFMPEG_p != 0;
-                }
-            #elif defined HAVE_FFMPEG
-                create_InputMediaStream_FFMPEG_p = create_InputMediaStream_FFMPEG;
-                release_InputMediaStream_FFMPEG_p = release_InputMediaStream_FFMPEG;
-                read_InputMediaStream_FFMPEG_p = read_InputMediaStream_FFMPEG;
-
-                initialized = true;
-            #endif
-        }
-
-        return initialized;
-    }
-}
-
-cv::gpu::detail::FFmpegVideoSource::FFmpegVideoSource(const String& fname) :
-    stream_(0)
-{
-    CV_Assert( init_MediaStream_FFMPEG() );
-
-    int codec;
-    int chroma_format;
-    int width;
-    int height;
-
-    stream_ = create_InputMediaStream_FFMPEG_p(fname.c_str(), &codec, &chroma_format, &width, &height);
-    if (!stream_)
-        CV_Error(cv::Error::StsUnsupportedFormat, "Unsupported video source");
-
-    format_.codec = static_cast<VideoReader_GPU::Codec>(codec);
-    format_.chromaFormat = static_cast<VideoReader_GPU::ChromaFormat>(chroma_format);
-    format_.width = width;
-    format_.height = height;
-}
-
-cv::gpu::detail::FFmpegVideoSource::~FFmpegVideoSource()
-{
-    release_InputMediaStream_FFMPEG_p(stream_);
-}
-
-cv::gpu::VideoReader_GPU::FormatInfo cv::gpu::detail::FFmpegVideoSource::format() const
-{
-    return format_;
-}
-
-void cv::gpu::detail::FFmpegVideoSource::start()
-{
-    stop_ = false;
-    hasError_ = false;
-    thread_.reset(new Thread(readLoop, this));
-}
-
-void cv::gpu::detail::FFmpegVideoSource::stop()
-{
-    stop_ = true;
-    thread_->wait();
-    thread_.reset();
-}
-
-bool cv::gpu::detail::FFmpegVideoSource::isStarted() const
-{
-    return !stop_;
-}
-
-bool cv::gpu::detail::FFmpegVideoSource::hasError() const
-{
-    return hasError_;
-}
-
-void cv::gpu::detail::FFmpegVideoSource::readLoop(void* userData)
-{
-    FFmpegVideoSource* thiz = static_cast<FFmpegVideoSource*>(userData);
-
-    for (;;)
-    {
-        unsigned char* data;
-        int size;
-        int endOfFile;
-
-        if (!read_InputMediaStream_FFMPEG_p(thiz->stream_, &data, &size, &endOfFile))
-        {
-            thiz->hasError_ = !endOfFile;
-            break;
-        }
-
-        if (!thiz->parseVideoData(data, size))
-        {
-            thiz->hasError_ = true;
-            break;
-        }
-
-        if (thiz->stop_)
-            break;
-    }
-
-    thiz->parseVideoData(0, 0, true);
-}
-
-#endif // HAVE_CUDA
--- a/modules/gpu/src/ffmpeg_video_source.h
+++ b/modules/gpu/src/ffmpeg_video_source.h
@@ -1,88 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __FFMPEG_VIDEO_SOURCE_H__
-#define __FFMPEG_VIDEO_SOURCE_H__
-
-#include "precomp.hpp"
-#include "thread_wrappers.h"
-
-#if defined(HAVE_CUDA) && defined(HAVE_NVCUVID)
-
-struct InputMediaStream_FFMPEG;
-
-namespace cv { namespace gpu
-{
-    namespace detail
-    {
-        class FFmpegVideoSource : public VideoReader_GPU::VideoSource
-        {
-        public:
-            FFmpegVideoSource(const String& fname);
-            ~FFmpegVideoSource();
-
-            VideoReader_GPU::FormatInfo format() const;
-            void start();
-            void stop();
-            bool isStarted() const;
-            bool hasError() const;
-
-        private:
-            FFmpegVideoSource(const FFmpegVideoSource&);
-            FFmpegVideoSource& operator =(const FFmpegVideoSource&);
-
-            VideoReader_GPU::FormatInfo format_;
-
-            InputMediaStream_FFMPEG* stream_;
-
-            std::auto_ptr<Thread> thread_;
-            volatile bool stop_;
-            volatile bool hasError_;
-
-            static void readLoop(void* userData);
-        };
-    }
-}}
-
-#endif // HAVE_CUDA
-
-#endif // __CUVUD_VIDEO_SOURCE_H__
--- a/modules/gpu/src/fgd_bgfg.cpp
+++ b/modules/gpu/src/fgd_bgfg.cpp
@@ -1,753 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "precomp.hpp"
-
-#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
-
-class cv::gpu::FGDStatModel::Impl
-{
-};
-
-cv::gpu::FGDStatModel::Params::Params() { throw_no_cuda(); }
-
-cv::gpu::FGDStatModel::FGDStatModel(int) { throw_no_cuda(); }
-cv::gpu::FGDStatModel::FGDStatModel(const cv::gpu::GpuMat&, const Params&, int) { throw_no_cuda(); }
-cv::gpu::FGDStatModel::~FGDStatModel() {}
-void cv::gpu::FGDStatModel::create(const cv::gpu::GpuMat&, const Params&) { throw_no_cuda(); }
-void cv::gpu::FGDStatModel::release() {}
-int cv::gpu::FGDStatModel::update(const cv::gpu::GpuMat&) { throw_no_cuda(); return 0; }
-
-#else
-
-#include "fgd_bgfg_common.hpp"
-#include "opencv2/imgproc/imgproc_c.h"
-
-namespace
-{
-    class BGPixelStat
-    {
-    public:
-        void create(cv::Size size, const cv::gpu::FGDStatModel::Params& params, int out_cn);
-        void release();
-
-        void setTrained();
-
-        operator bgfg::BGPixelStat();
-
-    private:
-        cv::gpu::GpuMat Pbc_;
-        cv::gpu::GpuMat Pbcc_;
-        cv::gpu::GpuMat is_trained_st_model_;
-        cv::gpu::GpuMat is_trained_dyn_model_;
-
-        cv::gpu::GpuMat ctable_Pv_;
-        cv::gpu::GpuMat ctable_Pvb_;
-        cv::gpu::GpuMat ctable_v_;
-
-        cv::gpu::GpuMat cctable_Pv_;
-        cv::gpu::GpuMat cctable_Pvb_;
-        cv::gpu::GpuMat cctable_v1_;
-        cv::gpu::GpuMat cctable_v2_;
-    };
-
-    void BGPixelStat::create(cv::Size size, const cv::gpu::FGDStatModel::Params& params, int out_cn)
-    {
-        cv::gpu::ensureSizeIsEnough(size, CV_32FC1, Pbc_);
-        Pbc_.setTo(cv::Scalar::all(0));
-
-        cv::gpu::ensureSizeIsEnough(size, CV_32FC1, Pbcc_);
-        Pbcc_.setTo(cv::Scalar::all(0));
-
-        cv::gpu::ensureSizeIsEnough(size, CV_8UC1, is_trained_st_model_);
-        is_trained_st_model_.setTo(cv::Scalar::all(0));
-
-        cv::gpu::ensureSizeIsEnough(size, CV_8UC1, is_trained_dyn_model_);
-        is_trained_dyn_model_.setTo(cv::Scalar::all(0));
-
-        cv::gpu::ensureSizeIsEnough(params.N2c * size.height, size.width, CV_32FC1, ctable_Pv_);
-        ctable_Pv_.setTo(cv::Scalar::all(0));
-
-        cv::gpu::ensureSizeIsEnough(params.N2c * size.height, size.width, CV_32FC1, ctable_Pvb_);
-        ctable_Pvb_.setTo(cv::Scalar::all(0));
-
-        cv::gpu::ensureSizeIsEnough(params.N2c * size.height, size.width, CV_8UC(out_cn), ctable_v_);
-        ctable_v_.setTo(cv::Scalar::all(0));
-
-        cv::gpu::ensureSizeIsEnough(params.N2cc * size.height, size.width, CV_32FC1, cctable_Pv_);
-        cctable_Pv_.setTo(cv::Scalar::all(0));
-
-        cv::gpu::ensureSizeIsEnough(params.N2cc * size.height, size.width, CV_32FC1, cctable_Pvb_);
-        cctable_Pvb_.setTo(cv::Scalar::all(0));
-
-        cv::gpu::ensureSizeIsEnough(params.N2cc * size.height, size.width, CV_8UC(out_cn), cctable_v1_);
-        cctable_v1_.setTo(cv::Scalar::all(0));
-
-        cv::gpu::ensureSizeIsEnough(params.N2cc * size.height, size.width, CV_8UC(out_cn), cctable_v2_);
-        cctable_v2_.setTo(cv::Scalar::all(0));
-    }
-
-    void BGPixelStat::release()
-    {
-        Pbc_.release();
-        Pbcc_.release();
-        is_trained_st_model_.release();
-        is_trained_dyn_model_.release();
-
-        ctable_Pv_.release();
-        ctable_Pvb_.release();
-        ctable_v_.release();
-
-        cctable_Pv_.release();
-        cctable_Pvb_.release();
-        cctable_v1_.release();
-        cctable_v2_.release();
-    }
-
-    void BGPixelStat::setTrained()
-    {
-        is_trained_st_model_.setTo(cv::Scalar::all(1));
-        is_trained_dyn_model_.setTo(cv::Scalar::all(1));
-    }
-
-    BGPixelStat::operator bgfg::BGPixelStat()
-    {
-        bgfg::BGPixelStat stat;
-
-        stat.rows_ = Pbc_.rows;
-
-        stat.Pbc_data_ = Pbc_.data;
-        stat.Pbc_step_ = Pbc_.step;
-
-        stat.Pbcc_data_ = Pbcc_.data;
-        stat.Pbcc_step_ = Pbcc_.step;
-
-        stat.is_trained_st_model_data_ = is_trained_st_model_.data;
-        stat.is_trained_st_model_step_ = is_trained_st_model_.step;
-
-        stat.is_trained_dyn_model_data_ = is_trained_dyn_model_.data;
-        stat.is_trained_dyn_model_step_ = is_trained_dyn_model_.step;
-
-        stat.ctable_Pv_data_ = ctable_Pv_.data;
-        stat.ctable_Pv_step_ = ctable_Pv_.step;
-
-        stat.ctable_Pvb_data_ = ctable_Pvb_.data;
-        stat.ctable_Pvb_step_ = ctable_Pvb_.step;
-
-        stat.ctable_v_data_ = ctable_v_.data;
-        stat.ctable_v_step_ = ctable_v_.step;
-
-        stat.cctable_Pv_data_ = cctable_Pv_.data;
-        stat.cctable_Pv_step_ = cctable_Pv_.step;
-
-        stat.cctable_Pvb_data_ = cctable_Pvb_.data;
-        stat.cctable_Pvb_step_ = cctable_Pvb_.step;
-
-        stat.cctable_v1_data_ = cctable_v1_.data;
-        stat.cctable_v1_step_ = cctable_v1_.step;
-
-        stat.cctable_v2_data_ = cctable_v2_.data;
-        stat.cctable_v2_step_ = cctable_v2_.step;
-
-        return stat;
-    }
-}
-
-class cv::gpu::FGDStatModel::Impl
-{
-public:
-    Impl(cv::gpu::GpuMat& background, cv::gpu::GpuMat& foreground, std::vector< std::vector<cv::Point> >& foreground_regions, int out_cn);
-    ~Impl();
-
-    void create(const cv::gpu::GpuMat& firstFrame, const cv::gpu::FGDStatModel::Params& params);
-    void release();
-
-    int update(const cv::gpu::GpuMat& curFrame);
-
-private:
-    Impl(const Impl&);
-    Impl& operator=(const Impl&);
-
-    int out_cn_;
-
-    cv::gpu::FGDStatModel::Params params_;
-
-    cv::gpu::GpuMat& background_;
-    cv::gpu::GpuMat& foreground_;
-    std::vector< std::vector<cv::Point> >& foreground_regions_;
-
-    cv::Mat h_foreground_;
-
-    cv::gpu::GpuMat prevFrame_;
-    cv::gpu::GpuMat Ftd_;
-    cv::gpu::GpuMat Fbd_;
-    BGPixelStat stat_;
-
-    cv::gpu::GpuMat hist_;
-    cv::gpu::GpuMat histBuf_;
-
-    cv::gpu::GpuMat countBuf_;
-
-    cv::gpu::GpuMat buf_;
-    cv::gpu::GpuMat filterBuf_;
-    cv::gpu::GpuMat filterBrd_;
-
-    cv::Ptr<cv::gpu::FilterEngine_GPU> dilateFilter_;
-    cv::Ptr<cv::gpu::FilterEngine_GPU> erodeFilter_;
-
-    CvMemStorage* storage_;
-};
-
-cv::gpu::FGDStatModel::Impl::Impl(cv::gpu::GpuMat& background, cv::gpu::GpuMat& foreground, std::vector< std::vector<cv::Point> >& foreground_regions, int out_cn) :
-    out_cn_(out_cn), background_(background), foreground_(foreground), foreground_regions_(foreground_regions)
-{
-    CV_Assert( out_cn_ == 3 || out_cn_ == 4 );
-
-    storage_ = cvCreateMemStorage();
-    CV_Assert( storage_ != 0 );
-}
-
-cv::gpu::FGDStatModel::Impl::~Impl()
-{
-    cvReleaseMemStorage(&storage_);
-}
-
-namespace
-{
-    void copyChannels(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, int dst_cn = -1)
-    {
-        const int src_cn = src.channels();
-
-        if (dst_cn < 0)
-            dst_cn = src_cn;
-
-        cv::gpu::ensureSizeIsEnough(src.size(), CV_MAKE_TYPE(src.depth(), dst_cn), dst);
-
-        if (src_cn == dst_cn)
-            src.copyTo(dst);
-        else
-        {
-            static const int cvt_codes[4][4] =
-            {
-                {-1, -1, cv::COLOR_GRAY2BGR, cv::COLOR_GRAY2BGRA},
-                {-1, -1, -1, -1},
-                {cv::COLOR_BGR2GRAY, -1, -1, cv::COLOR_BGR2BGRA},
-                {cv::COLOR_BGRA2GRAY, -1, cv::COLOR_BGRA2BGR, -1}
-            };
-
-            const int cvt_code = cvt_codes[src_cn - 1][dst_cn - 1];
-            CV_DbgAssert( cvt_code >= 0 );
-
-            cv::gpu::cvtColor(src, dst, cvt_code, dst_cn);
-        }
-    }
-}
-
-void cv::gpu::FGDStatModel::Impl::create(const cv::gpu::GpuMat& firstFrame, const cv::gpu::FGDStatModel::Params& params)
-{
-    CV_Assert(firstFrame.type() == CV_8UC3 || firstFrame.type() == CV_8UC4);
-
-    params_ = params;
-
-    cv::gpu::ensureSizeIsEnough(firstFrame.size(), CV_8UC1, foreground_);
-
-    copyChannels(firstFrame, background_, out_cn_);
-
-    copyChannels(firstFrame, prevFrame_);
-
-    cv::gpu::ensureSizeIsEnough(firstFrame.size(), CV_8UC1, Ftd_);
-    cv::gpu::ensureSizeIsEnough(firstFrame.size(), CV_8UC1, Fbd_);
-
-    stat_.create(firstFrame.size(), params_, out_cn_);
-    bgfg::setBGPixelStat(stat_);
-
-    if (params_.perform_morphing > 0)
-    {
-        cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(1 + params_.perform_morphing * 2, 1 + params_.perform_morphing * 2));
-        cv::Point anchor(params_.perform_morphing, params_.perform_morphing);
-
-        dilateFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_DILATE, CV_8UC1, kernel, filterBuf_, anchor);
-        erodeFilter_ = cv::gpu::createMorphologyFilter_GPU(cv::MORPH_ERODE, CV_8UC1, kernel, filterBuf_, anchor);
-    }
-}
-
-void cv::gpu::FGDStatModel::Impl::release()
-{
-    background_.release();
-    foreground_.release();
-
-    prevFrame_.release();
-    Ftd_.release();
-    Fbd_.release();
-    stat_.release();
-
-    hist_.release();
-    histBuf_.release();
-
-    countBuf_.release();
-
-    buf_.release();
-    filterBuf_.release();
-    filterBrd_.release();
-}
-
-/////////////////////////////////////////////////////////////////////////
-// changeDetection
-
-namespace
-{
-    void calcDiffHistogram(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, cv::gpu::GpuMat& hist, cv::gpu::GpuMat& histBuf)
-    {
-        typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, unsigned int* hist0, unsigned int* hist1, unsigned int* hist2, unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2, bool cc20, cudaStream_t stream);
-        static const func_t funcs[4][4] =
-        {
-            {0,0,0,0},
-            {0,0,0,0},
-            {0,0,bgfg::calcDiffHistogram_gpu<uchar3, uchar3>,bgfg::calcDiffHistogram_gpu<uchar3, uchar4>},
-            {0,0,bgfg::calcDiffHistogram_gpu<uchar4, uchar3>,bgfg::calcDiffHistogram_gpu<uchar4, uchar4>}
-        };
-
-        hist.create(3, 256, CV_32SC1);
-        histBuf.create(3, bgfg::PARTIAL_HISTOGRAM_COUNT * bgfg::HISTOGRAM_BIN_COUNT, CV_32SC1);
-
-        funcs[prevFrame.channels() - 1][curFrame.channels() - 1](
-                    prevFrame, curFrame,
-                    hist.ptr<unsigned int>(0), hist.ptr<unsigned int>(1), hist.ptr<unsigned int>(2),
-                    histBuf.ptr<unsigned int>(0), histBuf.ptr<unsigned int>(1), histBuf.ptr<unsigned int>(2),
-                    cv::gpu::deviceSupports(cv::gpu::FEATURE_SET_COMPUTE_20), 0);
-    }
-
-    void calcRelativeVariance(unsigned int hist[3 * 256], double relativeVariance[3][bgfg::HISTOGRAM_BIN_COUNT])
-    {
-        std::memset(relativeVariance, 0, 3 * bgfg::HISTOGRAM_BIN_COUNT * sizeof(double));
-
-        for (int thres = bgfg::HISTOGRAM_BIN_COUNT - 2; thres >= 0; --thres)
-        {
-            cv::Vec3d sum(0.0, 0.0, 0.0);
-            cv::Vec3d sqsum(0.0, 0.0, 0.0);
-            cv::Vec3i count(0, 0, 0);
-
-            for (int j = thres; j < bgfg::HISTOGRAM_BIN_COUNT; ++j)
-            {
-                sum[0]   += static_cast<double>(j) * hist[j];
-                sqsum[0] += static_cast<double>(j * j) * hist[j];
-                count[0] += hist[j];
-
-                sum[1]   += static_cast<double>(j) * hist[j + 256];
-                sqsum[1] += static_cast<double>(j * j) * hist[j + 256];
-                count[1] += hist[j + 256];
-
-                sum[2]   += static_cast<double>(j) * hist[j + 512];
-                sqsum[2] += static_cast<double>(j * j) * hist[j + 512];
-                count[2] += hist[j + 512];
-            }
-
-            count[0] = std::max(count[0], 1);
-            count[1] = std::max(count[1], 1);
-            count[2] = std::max(count[2], 1);
-
-            cv::Vec3d my(
-                sum[0] / count[0],
-                sum[1] / count[1],
-                sum[2] / count[2]
-            );
-
-            relativeVariance[0][thres] = std::sqrt(sqsum[0] / count[0] - my[0] * my[0]);
-            relativeVariance[1][thres] = std::sqrt(sqsum[1] / count[1] - my[1] * my[1]);
-            relativeVariance[2][thres] = std::sqrt(sqsum[2] / count[2] - my[2] * my[2]);
-        }
-    }
-
-    void calcDiffThreshMask(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, cv::Vec3d bestThres, cv::gpu::GpuMat& changeMask)
-    {
-        typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream);
-        static const func_t funcs[4][4] =
-        {
-            {0,0,0,0},
-            {0,0,0,0},
-            {0,0,bgfg::calcDiffThreshMask_gpu<uchar3, uchar3>,bgfg::calcDiffThreshMask_gpu<uchar3, uchar4>},
-            {0,0,bgfg::calcDiffThreshMask_gpu<uchar4, uchar3>,bgfg::calcDiffThreshMask_gpu<uchar4, uchar4>}
-        };
-
-        changeMask.setTo(cv::Scalar::all(0));
-
-        funcs[prevFrame.channels() - 1][curFrame.channels() - 1](prevFrame, curFrame, make_uchar3((uchar)bestThres[0], (uchar)bestThres[1], (uchar)bestThres[2]), changeMask, 0);
-    }
-
-    // performs change detection for Foreground detection algorithm
-    void changeDetection(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, cv::gpu::GpuMat& changeMask, cv::gpu::GpuMat& hist, cv::gpu::GpuMat& histBuf)
-    {
-        calcDiffHistogram(prevFrame, curFrame, hist, histBuf);
-
-        unsigned int histData[3 * 256];
-        cv::Mat h_hist(3, 256, CV_32SC1, histData);
-        hist.download(h_hist);
-
-        double relativeVariance[3][bgfg::HISTOGRAM_BIN_COUNT];
-        calcRelativeVariance(histData, relativeVariance);
-
-        // Find maximum:
-        cv::Vec3d bestThres(10.0, 10.0, 10.0);
-        for (int i = 0; i < bgfg::HISTOGRAM_BIN_COUNT; ++i)
-        {
-            bestThres[0] = std::max(bestThres[0], relativeVariance[0][i]);
-            bestThres[1] = std::max(bestThres[1], relativeVariance[1][i]);
-            bestThres[2] = std::max(bestThres[2], relativeVariance[2][i]);
-        }
-
-        calcDiffThreshMask(prevFrame, curFrame, bestThres, changeMask);
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////
-// bgfgClassification
-
-namespace
-{
-    int bgfgClassification(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame,
-                           const cv::gpu::GpuMat& Ftd, const cv::gpu::GpuMat& Fbd,
-                           cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& countBuf,
-                           const cv::gpu::FGDStatModel::Params& params, int out_cn)
-    {
-        typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd, cv::gpu::PtrStepSzb foreground,
-                               int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
-        static const func_t funcs[4][4][4] =
-        {
-            {
-                {0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,0,0,0}
-            },
-            {
-                {0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,0,0,0}
-            },
-            {
-                {0,0,0,0}, {0,0,0,0},
-                {0,0,bgfg::bgfgClassification_gpu<uchar3, uchar3, uchar3>,bgfg::bgfgClassification_gpu<uchar3, uchar3, uchar4>},
-                {0,0,bgfg::bgfgClassification_gpu<uchar3, uchar4, uchar3>,bgfg::bgfgClassification_gpu<uchar3, uchar4, uchar4>}
-            },
-            {
-                {0,0,0,0}, {0,0,0,0},
-                {0,0,bgfg::bgfgClassification_gpu<uchar4, uchar3, uchar3>,bgfg::bgfgClassification_gpu<uchar4, uchar3, uchar4>},
-                {0,0,bgfg::bgfgClassification_gpu<uchar4, uchar4, uchar3>,bgfg::bgfgClassification_gpu<uchar4, uchar4, uchar4>}
-            }
-        };
-
-        const int deltaC  = cvRound(params.delta * 256 / params.Lc);
-        const int deltaCC = cvRound(params.delta * 256 / params.Lcc);
-
-        funcs[prevFrame.channels() - 1][curFrame.channels() - 1][out_cn - 1](prevFrame, curFrame, Ftd, Fbd, foreground, deltaC, deltaCC, params.alpha2, params.N1c, params.N1cc, 0);
-
-        int count = cv::gpu::countNonZero(foreground, countBuf);
-
-        cv::gpu::multiply(foreground, cv::Scalar::all(255), foreground);
-
-        return count;
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////
-// smoothForeground
-
-namespace
-{
-    void morphology(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst, cv::gpu::GpuMat& filterBrd, int brd, cv::Ptr<cv::gpu::FilterEngine_GPU>& filter, cv::Scalar brdVal)
-    {
-        cv::gpu::copyMakeBorder(src, filterBrd, brd, brd, brd, brd, cv::BORDER_CONSTANT, brdVal);
-        filter->apply(filterBrd(cv::Rect(brd, brd, src.cols, src.rows)), dst, cv::Rect(0, 0, src.cols, src.rows));
-    }
-
-    void smoothForeground(cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& filterBrd, cv::gpu::GpuMat& buf,
-                          cv::Ptr<cv::gpu::FilterEngine_GPU>& erodeFilter, cv::Ptr<cv::gpu::FilterEngine_GPU>& dilateFilter,
-                          const cv::gpu::FGDStatModel::Params& params)
-    {
-        const int brd = params.perform_morphing;
-
-        const cv::Scalar erodeBrdVal = cv::Scalar::all(UCHAR_MAX);
-        const cv::Scalar dilateBrdVal = cv::Scalar::all(0);
-
-        // MORPH_OPEN
-        morphology(foreground, buf, filterBrd, brd, erodeFilter, erodeBrdVal);
-        morphology(buf, foreground, filterBrd, brd, dilateFilter, dilateBrdVal);
-
-        // MORPH_CLOSE
-        morphology(foreground, buf, filterBrd, brd, dilateFilter, dilateBrdVal);
-        morphology(buf, foreground, filterBrd, brd, erodeFilter, erodeBrdVal);
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////
-// findForegroundRegions
-
-namespace
-{
-    void seqToContours(CvSeq* _ccontours, CvMemStorage* storage, cv::OutputArrayOfArrays _contours)
-    {
-        cv::Seq<CvSeq*> all_contours(cvTreeToNodeSeq(_ccontours, sizeof(CvSeq), storage));
-
-        size_t total = all_contours.size();
-
-        _contours.create((int) total, 1, 0, -1, true);
-
-        cv::SeqIterator<CvSeq*> it = all_contours.begin();
-        for (size_t i = 0; i < total; ++i, ++it)
-        {
-            CvSeq* c = *it;
-            ((CvContour*)c)->color = (int)i;
-            _contours.create((int)c->total, 1, CV_32SC2, (int)i, true);
-            cv::Mat ci = _contours.getMat((int)i);
-            CV_Assert( ci.isContinuous() );
-            cvCvtSeqToArray(c, ci.data);
-        }
-    }
-
-    int findForegroundRegions(cv::gpu::GpuMat& d_foreground, cv::Mat& h_foreground, std::vector< std::vector<cv::Point> >& foreground_regions,
-                              CvMemStorage* storage, const cv::gpu::FGDStatModel::Params& params)
-    {
-        int region_count = 0;
-
-        // Discard under-size foreground regions:
-
-        d_foreground.download(h_foreground);
-        IplImage ipl_foreground = h_foreground;
-        CvSeq* first_seq = 0;
-
-        cvFindContours(&ipl_foreground, storage, &first_seq, sizeof(CvContour), CV_RETR_LIST);
-
-        for (CvSeq* seq = first_seq; seq; seq = seq->h_next)
-        {
-            CvContour* cnt = reinterpret_cast<CvContour*>(seq);
-
-            if (cnt->rect.width * cnt->rect.height < params.minArea || (params.is_obj_without_holes && CV_IS_SEQ_HOLE(seq)))
-            {
-                // Delete under-size contour:
-                CvSeq* prev_seq = seq->h_prev;
-                if (prev_seq)
-                {
-                    prev_seq->h_next = seq->h_next;
-
-                    if (seq->h_next)
-                        seq->h_next->h_prev = prev_seq;
-                }
-                else
-                {
-                    first_seq = seq->h_next;
-
-                    if (seq->h_next)
-                        seq->h_next->h_prev = NULL;
-                }
-            }
-            else
-            {
-                region_count++;
-            }
-        }
-
-        seqToContours(first_seq, storage, foreground_regions);
-        h_foreground.setTo(0);
-
-        cv::drawContours(h_foreground, foreground_regions, -1, cv::Scalar::all(255), -1);
-
-        d_foreground.upload(h_foreground);
-
-        return region_count;
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////
-// updateBackgroundModel
-
-namespace
-{
-    void updateBackgroundModel(const cv::gpu::GpuMat& prevFrame, const cv::gpu::GpuMat& curFrame, const cv::gpu::GpuMat& Ftd, const cv::gpu::GpuMat& Fbd,
-                               const cv::gpu::GpuMat& foreground, cv::gpu::GpuMat& background,
-                               const cv::gpu::FGDStatModel::Params& params)
-    {
-        typedef void (*func_t)(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd,
-                               cv::gpu::PtrStepSzb foreground, cv::gpu::PtrStepSzb background,
-                               int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T, cudaStream_t stream);
-        static const func_t funcs[4][4][4] =
-        {
-            {
-                {0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,0,0,0}
-            },
-            {
-                {0,0,0,0}, {0,0,0,0}, {0,0,0,0}, {0,0,0,0}
-            },
-            {
-                {0,0,0,0}, {0,0,0,0},
-                {0,0,bgfg::updateBackgroundModel_gpu<uchar3, uchar3, uchar3>,bgfg::updateBackgroundModel_gpu<uchar3, uchar3, uchar4>},
-                {0,0,bgfg::updateBackgroundModel_gpu<uchar3, uchar4, uchar3>,bgfg::updateBackgroundModel_gpu<uchar3, uchar4, uchar4>}
-            },
-            {
-                {0,0,0,0}, {0,0,0,0},
-                {0,0,bgfg::updateBackgroundModel_gpu<uchar4, uchar3, uchar3>,bgfg::updateBackgroundModel_gpu<uchar4, uchar3, uchar4>},
-                {0,0,bgfg::updateBackgroundModel_gpu<uchar4, uchar4, uchar3>,bgfg::updateBackgroundModel_gpu<uchar4, uchar4, uchar4>}
-            }
-        };
-
-        const int deltaC  = cvRound(params.delta * 256 / params.Lc);
-        const int deltaCC = cvRound(params.delta * 256 / params.Lcc);
-
-        funcs[prevFrame.channels() - 1][curFrame.channels() - 1][background.channels() - 1](
-                    prevFrame, curFrame, Ftd, Fbd, foreground, background,
-                    deltaC, deltaCC, params.alpha1, params.alpha2, params.alpha3, params.N1c, params.N1cc, params.N2c, params.N2cc, params.T,
-                    0);
-    }
-}
-
-/////////////////////////////////////////////////////////////////////////
-// Impl::update
-
-int cv::gpu::FGDStatModel::Impl::update(const cv::gpu::GpuMat& curFrame)
-{
-    CV_Assert(curFrame.type() == CV_8UC3 || curFrame.type() == CV_8UC4);
-    CV_Assert(curFrame.size() == prevFrame_.size());
-
-    cvClearMemStorage(storage_);
-    foreground_regions_.clear();
-    foreground_.setTo(cv::Scalar::all(0));
-
-    changeDetection(prevFrame_, curFrame, Ftd_, hist_, histBuf_);
-    changeDetection(background_, curFrame, Fbd_, hist_, histBuf_);
-
-    int FG_pixels_count = bgfgClassification(prevFrame_, curFrame, Ftd_, Fbd_, foreground_, countBuf_, params_, out_cn_);
-
-    if (params_.perform_morphing > 0)
-        smoothForeground(foreground_, filterBrd_, buf_, erodeFilter_, dilateFilter_, params_);
-
-    int region_count = 0;
-    if (params_.minArea > 0 || params_.is_obj_without_holes)
-        region_count = findForegroundRegions(foreground_, h_foreground_, foreground_regions_, storage_, params_);
-
-    // Check ALL BG update condition:
-    const double BGFG_FGD_BG_UPDATE_TRESH = 0.5;
-    if (static_cast<double>(FG_pixels_count) / Ftd_.size().area() > BGFG_FGD_BG_UPDATE_TRESH)
-        stat_.setTrained();
-
-    updateBackgroundModel(prevFrame_, curFrame, Ftd_, Fbd_, foreground_, background_, params_);
-
-    copyChannels(curFrame, prevFrame_);
-
-    return region_count;
-}
-
-namespace
-{
-    // Default parameters of foreground detection algorithm:
-    const int BGFG_FGD_LC  = 128;
-    const int BGFG_FGD_N1C = 15;
-    const int BGFG_FGD_N2C = 25;
-
-    const int BGFG_FGD_LCC   = 64;
-    const int BGFG_FGD_N1CC = 25;
-    const int BGFG_FGD_N2CC = 40;
-
-    // Background reference image update parameter:
-    const float BGFG_FGD_ALPHA_1 = 0.1f;
-
-    // stat model update parameter
-    // 0.002f ~ 1K frame(~45sec), 0.005 ~ 18sec (if 25fps and absolutely static BG)
-    const float BGFG_FGD_ALPHA_2 = 0.005f;
-
-    // start value for alpha parameter (to fast initiate statistic model)
-    const float BGFG_FGD_ALPHA_3 = 0.1f;
-
-    const float BGFG_FGD_DELTA = 2.0f;
-
-    const float BGFG_FGD_T = 0.9f;
-
-    const float BGFG_FGD_MINAREA= 15.0f;
-}
-
-cv::gpu::FGDStatModel::Params::Params()
-{
-    Lc      = BGFG_FGD_LC;
-    N1c     = BGFG_FGD_N1C;
-    N2c     = BGFG_FGD_N2C;
-
-    Lcc     = BGFG_FGD_LCC;
-    N1cc    = BGFG_FGD_N1CC;
-    N2cc    = BGFG_FGD_N2CC;
-
-    delta   = BGFG_FGD_DELTA;
-
-    alpha1  = BGFG_FGD_ALPHA_1;
-    alpha2  = BGFG_FGD_ALPHA_2;
-    alpha3  = BGFG_FGD_ALPHA_3;
-
-    T       = BGFG_FGD_T;
-    minArea = BGFG_FGD_MINAREA;
-
-    is_obj_without_holes = true;
-    perform_morphing     = 1;
-}
-
-cv::gpu::FGDStatModel::FGDStatModel(int out_cn)
-{
-    impl_.reset(new Impl(background, foreground, foreground_regions, out_cn));
-}
-
-cv::gpu::FGDStatModel::FGDStatModel(const cv::gpu::GpuMat& firstFrame, const Params& params, int out_cn)
-{
-    impl_.reset(new Impl(background, foreground, foreground_regions, out_cn));
-    create(firstFrame, params);
-}
-
-cv::gpu::FGDStatModel::~FGDStatModel()
-{
-}
-
-void cv::gpu::FGDStatModel::create(const cv::gpu::GpuMat& firstFrame, const Params& params)
-{
-    impl_->create(firstFrame, params);
-}
-
-void cv::gpu::FGDStatModel::release()
-{
-    impl_->release();
-}
-
-int cv::gpu::FGDStatModel::update(const cv::gpu::GpuMat& curFrame)
-{
-    return impl_->update(curFrame);
-}
-
-#endif // HAVE_CUDA
--- a/modules/gpu/src/filtering.cpp
+++ b/modules/gpu/src/filtering.cpp
--- a/modules/gpu/src/frame_queue.cpp
+++ b/modules/gpu/src/frame_queue.cpp
@@ -1,117 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "frame_queue.h"
-
-#if defined(HAVE_CUDA) && defined(HAVE_NVCUVID)
-
-cv::gpu::detail::FrameQueue::FrameQueue() :
-    endOfDecode_(0),
-    framesInQueue_(0),
-    readPosition_(0)
-{
-    std::memset(displayQueue_, 0, sizeof(displayQueue_));
-    std::memset((void*)isFrameInUse_, 0, sizeof(isFrameInUse_));
-}
-
-bool cv::gpu::detail::FrameQueue::waitUntilFrameAvailable(int pictureIndex)
-{
-    while (isInUse(pictureIndex))
-    {
-        // Decoder is getting too far ahead from display
-        Thread::sleep(1);
-
-        if (isEndOfDecode())
-            return false;
-    }
-
-    return true;
-}
-
-void cv::gpu::detail::FrameQueue::enqueue(const CUVIDPARSERDISPINFO* picParams)
-{
-    // Mark the frame as 'in-use' so we don't re-use it for decoding until it is no longer needed
-    // for display
-    isFrameInUse_[picParams->picture_index] = true;
-
-    // Wait until we have a free entry in the display queue (should never block if we have enough entries)
-    do
-    {
-        bool isFramePlaced = false;
-
-        {
-            CriticalSection::AutoLock autoLock(criticalSection_);
-
-            if (framesInQueue_ < MaximumSize)
-            {
-                int writePosition = (readPosition_ + framesInQueue_) % MaximumSize;
-                displayQueue_[writePosition] = *picParams;
-                framesInQueue_++;
-                isFramePlaced = true;
-            }
-        }
-
-        if (isFramePlaced) // Done
-            break;
-
-        // Wait a bit
-        Thread::sleep(1);
-    } while (!isEndOfDecode());
-}
-
-bool cv::gpu::detail::FrameQueue::dequeue(CUVIDPARSERDISPINFO& displayInfo)
-{
-    CriticalSection::AutoLock autoLock(criticalSection_);
-
-    if (framesInQueue_ > 0)
-    {
-        int entry = readPosition_;
-        displayInfo = displayQueue_[entry];
-        readPosition_ = (entry + 1) % MaximumSize;
-        framesInQueue_--;
-        return true;
-    }
-
-    return false;
-}
-
-#endif // HAVE_CUDA
--- a/modules/gpu/src/frame_queue.h
+++ b/modules/gpu/src/frame_queue.h
@@ -1,103 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __FRAME_QUEUE_H__
-#define __FRAME_QUEUE_H__
-
-#include "precomp.hpp"
-#include "thread_wrappers.h"
-
-#if defined(HAVE_CUDA) && defined(HAVE_NVCUVID)
-
-namespace cv { namespace gpu
-{
-    namespace detail
-    {
-        class FrameQueue
-        {
-        public:
-            static const int MaximumSize = 20; // MAX_FRM_CNT;
-
-            FrameQueue();
-
-            void endDecode() { endOfDecode_ = true; }
-            bool isEndOfDecode() const { return endOfDecode_ != 0;}
-
-            // Spins until frame becomes available or decoding gets canceled.
-            // If the requested frame is available the method returns true.
-            // If decoding was interupted before the requested frame becomes
-            // available, the method returns false.
-            bool waitUntilFrameAvailable(int pictureIndex);
-
-            void enqueue(const CUVIDPARSERDISPINFO* picParams);
-
-            // Deque the next frame.
-            // Parameters:
-            //      displayInfo - New frame info gets placed into this object.
-            // Returns:
-            //      true, if a new frame was returned,
-            //      false, if the queue was empty and no new frame could be returned.
-            bool dequeue(CUVIDPARSERDISPINFO& displayInfo);
-
-            void releaseFrame(const CUVIDPARSERDISPINFO& picParams) { isFrameInUse_[picParams.picture_index] = false; }
-
-        private:
-            FrameQueue(const FrameQueue&);
-            FrameQueue& operator =(const FrameQueue&);
-
-            bool isInUse(int pictureIndex) const { return isFrameInUse_[pictureIndex] != 0; }
-
-            CriticalSection criticalSection_;
-
-            volatile int isFrameInUse_[MaximumSize];
-            volatile int endOfDecode_;
-
-            int framesInQueue_;
-            int readPosition_;
-            CUVIDPARSERDISPINFO displayQueue_[MaximumSize];
-        };
-    }
-}}
-
-#endif // HAVE_CUDA
-
-#endif // __FRAME_QUEUE_H__
--- a/Show More
+++ b/Show More