renamed gpuimgproc -> cudaimgproc

Vladislav Vinogradov
2013-07-23 16:59:34 +04:00
parent 219b662127
commit a0ae602bb7
93 changed files with 54 additions and 54 deletions


@@ -0,0 +1,99 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::cuda::bilateralFilter(InputArray, OutputArray, int, float, float, int, Stream&) { throw_no_cuda(); }
#else
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
template<typename T>
void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, int borderMode, cudaStream_t stream);
}
}}}
void cv::cuda::bilateralFilter(InputArray _src, OutputArray _dst, int kernel_size, float sigma_color, float sigma_spatial, int borderMode, Stream& stream)
{
using cv::cuda::device::imgproc::bilateral_filter_gpu;
typedef void (*func_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, int borderMode, cudaStream_t s);
static const func_t funcs[6][4] =
{
{bilateral_filter_gpu<uchar> , 0 /*bilateral_filter_gpu<uchar2>*/ , bilateral_filter_gpu<uchar3> , bilateral_filter_gpu<uchar4> },
{0 /*bilateral_filter_gpu<schar>*/, 0 /*bilateral_filter_gpu<schar2>*/ , 0 /*bilateral_filter_gpu<schar3>*/, 0 /*bilateral_filter_gpu<schar4>*/},
{bilateral_filter_gpu<ushort> , 0 /*bilateral_filter_gpu<ushort2>*/, bilateral_filter_gpu<ushort3> , bilateral_filter_gpu<ushort4> },
{bilateral_filter_gpu<short> , 0 /*bilateral_filter_gpu<short2>*/ , bilateral_filter_gpu<short3> , bilateral_filter_gpu<short4> },
{0 /*bilateral_filter_gpu<int>*/ , 0 /*bilateral_filter_gpu<int2>*/ , 0 /*bilateral_filter_gpu<int3>*/ , 0 /*bilateral_filter_gpu<int4>*/ },
{bilateral_filter_gpu<float> , 0 /*bilateral_filter_gpu<float2>*/ , bilateral_filter_gpu<float3> , bilateral_filter_gpu<float4> }
};
sigma_color = (sigma_color <= 0 ) ? 1 : sigma_color;
sigma_spatial = (sigma_spatial <= 0 ) ? 1 : sigma_spatial;
int radius = (kernel_size <= 0) ? cvRound(sigma_spatial*1.5) : kernel_size/2;
kernel_size = std::max(radius, 1)*2 + 1;
GpuMat src = _src.getGpuMat();
CV_Assert( src.depth() <= CV_32F && src.channels() <= 4 );
CV_Assert( borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT || borderMode == BORDER_REFLECT || borderMode == BORDER_WRAP );
const func_t func = funcs[src.depth()][src.channels() - 1];
CV_Assert( func != 0 );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
func(src, dst, kernel_size, sigma_spatial, sigma_color, borderMode, StreamAccessor::getStream(stream));
}
#endif
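
For reference, a minimal usage sketch of the host wrapper above, not part of this commit: it assumes the renamed module installs an opencv2/cudaimgproc.hpp header and that OpenCV is built with CUDA; the kernel size and sigma values are arbitrary illustration values.

// Usage sketch (editorial illustration, not part of the commit).
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>

void bilateralDemo(const cv::Mat& hostSrc)
{
    cv::cuda::GpuMat src, dst;
    src.upload(hostSrc);                       // CV_8U/16U/16S/32F, 1/3/4 channels
    cv::cuda::bilateralFilter(src, dst,
                              /*kernel_size*/   9,
                              /*sigma_color*/   30.0f,
                              /*sigma_spatial*/ 5.0f,
                              cv::BORDER_REFLECT101,
                              cv::cuda::Stream::Null());
    cv::Mat hostDst;
    dst.download(hostDst);
}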


@@ -0,0 +1,109 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::cuda::blendLinear(InputArray, InputArray, InputArray, InputArray, OutputArray, Stream&) { throw_no_cuda(); }
#else
////////////////////////////////////////////////////////////////////////
// blendLinear
namespace cv { namespace cuda { namespace device
{
namespace blend
{
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
}
}}}
using namespace ::cv::cuda::device::blend;
void cv::cuda::blendLinear(InputArray _img1, InputArray _img2, InputArray _weights1, InputArray _weights2,
OutputArray _result, Stream& stream)
{
GpuMat img1 = _img1.getGpuMat();
GpuMat img2 = _img2.getGpuMat();
GpuMat weights1 = _weights1.getGpuMat();
GpuMat weights2 = _weights2.getGpuMat();
CV_Assert( img1.size() == img2.size() );
CV_Assert( img1.type() == img2.type() );
CV_Assert( weights1.size() == img1.size() );
CV_Assert( weights2.size() == img2.size() );
CV_Assert( weights1.type() == CV_32FC1 );
CV_Assert( weights2.type() == CV_32FC1 );
const Size size = img1.size();
const int depth = img1.depth();
const int cn = img1.channels();
_result.create(size, CV_MAKE_TYPE(depth, cn));
GpuMat result = _result.getGpuMat();
switch (depth)
{
case CV_8U:
if (cn != 4)
blendLinearCaller<uchar>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
else
blendLinearCaller8UC4(size.height, size.width, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
break;
case CV_32F:
blendLinearCaller<float>(size.height, size.width, cn, img1, img2, weights1, weights2, result, StreamAccessor::getStream(stream));
break;
default:
CV_Error(cv::Error::StsUnsupportedFormat, "bad image depth in linear blending function");
}
}
#endif
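
A minimal sketch of calling cv::cuda::blendLinear as defined above, not part of the commit; the constant weight values and the opencv2/cudaimgproc.hpp include are assumptions for illustration.

// Usage sketch (editorial illustration): blend two same-sized, same-type images
// with per-pixel CV_32FC1 weight maps, as required by the asserts above.
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>

void blendDemo(const cv::cuda::GpuMat& img1, const cv::cuda::GpuMat& img2)
{
    cv::cuda::GpuMat w1(img1.size(), CV_32FC1, cv::Scalar::all(0.3));
    cv::cuda::GpuMat w2(img2.size(), CV_32FC1, cv::Scalar::all(0.7));
    cv::cuda::GpuMat result;
    cv::cuda::blendLinear(img1, img2, w1, w2, result, cv::cuda::Stream::Null());
}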


@@ -0,0 +1,234 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
Ptr<CannyEdgeDetector> cv::cuda::createCannyEdgeDetector(double, double, int, bool) { throw_no_cuda(); return Ptr<CannyEdgeDetector>(); }
#else /* !defined (HAVE_CUDA) */
namespace canny
{
void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad);
void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh);
void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1);
void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2);
void getEdges(PtrStepSzi map, PtrStepSzb dst);
}
namespace
{
class CannyImpl : public CannyEdgeDetector
{
public:
CannyImpl(double low_thresh, double high_thresh, int apperture_size, bool L2gradient) :
low_thresh_(low_thresh), high_thresh_(high_thresh), apperture_size_(apperture_size), L2gradient_(L2gradient)
{
old_apperture_size_ = -1;
}
void detect(InputArray image, OutputArray edges);
void detect(InputArray dx, InputArray dy, OutputArray edges);
void setLowThreshold(double low_thresh) { low_thresh_ = low_thresh; }
double getLowThreshold() const { return low_thresh_; }
void setHighThreshold(double high_thresh) { high_thresh_ = high_thresh; }
double getHighThreshold() const { return high_thresh_; }
void setAppertureSize(int apperture_size) { apperture_size_ = apperture_size; }
int getAppertureSize() const { return apperture_size_; }
void setL2Gradient(bool L2gradient) { L2gradient_ = L2gradient; }
bool getL2Gradient() const { return L2gradient_; }
void write(FileStorage& fs) const
{
fs << "name" << "Canny_GPU"
<< "low_thresh" << low_thresh_
<< "high_thresh" << high_thresh_
<< "apperture_size" << apperture_size_
<< "L2gradient" << L2gradient_;
}
void read(const FileNode& fn)
{
CV_Assert( String(fn["name"]) == "Canny_GPU" );
low_thresh_ = (double)fn["low_thresh"];
high_thresh_ = (double)fn["high_thresh"];
apperture_size_ = (int)fn["apperture_size"];
L2gradient_ = (int)fn["L2gradient"] != 0;
}
private:
void createBuf(Size image_size);
void CannyCaller(GpuMat& edges);
double low_thresh_;
double high_thresh_;
int apperture_size_;
bool L2gradient_;
GpuMat dx_, dy_;
GpuMat mag_;
GpuMat map_;
GpuMat st1_, st2_;
#ifdef HAVE_OPENCV_CUDAFILTERS
Ptr<Filter> filterDX_, filterDY_;
#endif
int old_apperture_size_;
};
void CannyImpl::detect(InputArray _image, OutputArray _edges)
{
GpuMat image = _image.getGpuMat();
CV_Assert( image.type() == CV_8UC1 );
CV_Assert( deviceSupports(SHARED_ATOMICS) );
if (low_thresh_ > high_thresh_)
std::swap(low_thresh_, high_thresh_);
createBuf(image.size());
_edges.create(image.size(), CV_8UC1);
GpuMat edges = _edges.getGpuMat();
if (apperture_size_ == 3)
{
Size wholeSize;
Point ofs;
image.locateROI(wholeSize, ofs);
GpuMat srcWhole(wholeSize, image.type(), image.datastart, image.step);
canny::calcMagnitude(srcWhole, ofs.x, ofs.y, dx_, dy_, mag_, L2gradient_);
}
else
{
#ifndef HAVE_OPENCV_CUDAFILTERS
throw_no_cuda();
#else
filterDX_->apply(image, dx_);
filterDY_->apply(image, dy_);
canny::calcMagnitude(dx_, dy_, mag_, L2gradient_);
#endif
}
CannyCaller(edges);
}
void CannyImpl::detect(InputArray _dx, InputArray _dy, OutputArray _edges)
{
GpuMat dx = _dx.getGpuMat();
GpuMat dy = _dy.getGpuMat();
CV_Assert( dx.type() == CV_32SC1 );
CV_Assert( dy.type() == dx.type() && dy.size() == dx.size() );
CV_Assert( deviceSupports(SHARED_ATOMICS) );
dx.copyTo(dx_);
dy.copyTo(dy_);
if (low_thresh_ > high_thresh_)
std::swap(low_thresh_, high_thresh_);
createBuf(dx.size());
_edges.create(dx.size(), CV_8UC1);
GpuMat edges = _edges.getGpuMat();
canny::calcMagnitude(dx_, dy_, mag_, L2gradient_);
CannyCaller(edges);
}
void CannyImpl::createBuf(Size image_size)
{
ensureSizeIsEnough(image_size, CV_32SC1, dx_);
ensureSizeIsEnough(image_size, CV_32SC1, dy_);
#ifdef HAVE_OPENCV_CUDAFILTERS
if (apperture_size_ != 3 && apperture_size_ != old_apperture_size_)
{
filterDX_ = cuda::createDerivFilter(CV_8UC1, CV_32S, 1, 0, apperture_size_, false, 1, BORDER_REPLICATE);
filterDY_ = cuda::createDerivFilter(CV_8UC1, CV_32S, 0, 1, apperture_size_, false, 1, BORDER_REPLICATE);
old_apperture_size_ = apperture_size_;
}
#endif
ensureSizeIsEnough(image_size, CV_32FC1, mag_);
ensureSizeIsEnough(image_size, CV_32SC1, map_);
ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st1_);
ensureSizeIsEnough(1, image_size.area(), CV_16UC2, st2_);
}
void CannyImpl::CannyCaller(GpuMat& edges)
{
map_.setTo(Scalar::all(0));
canny::calcMap(dx_, dy_, mag_, map_, static_cast<float>(low_thresh_), static_cast<float>(high_thresh_));
canny::edgesHysteresisLocal(map_, st1_.ptr<ushort2>());
canny::edgesHysteresisGlobal(map_, st1_.ptr<ushort2>(), st2_.ptr<ushort2>());
canny::getEdges(map_, edges);
}
}
Ptr<CannyEdgeDetector> cv::cuda::createCannyEdgeDetector(double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
{
return new CannyImpl(low_thresh, high_thresh, apperture_size, L2gradient);
}
#endif /* !defined (HAVE_CUDA) */
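
A minimal sketch, not part of the commit, showing how the CannyEdgeDetector created above might be used; the threshold values are arbitrary and the input must be a single-channel 8-bit GpuMat, per the assert in detect().

// Usage sketch (editorial illustration) for the algorithm interface above.
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>

void cannyDemo(const cv::cuda::GpuMat& gray8u)
{
    cv::Ptr<cv::cuda::CannyEdgeDetector> canny =
        cv::cuda::createCannyEdgeDetector(/*low_thresh*/ 50.0, /*high_thresh*/ 100.0,
                                          /*apperture_size*/ 3, /*L2gradient*/ false);
    cv::cuda::GpuMat edges;                    // filled as CV_8UC1
    canny->detect(gray8u, edges);
}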

File diff suppressed because it is too large.


@@ -0,0 +1,189 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) || !defined(HAVE_OPENCV_CUDAFILTERS)
Ptr<cuda::CornernessCriteria> cv::cuda::createHarrisCorner(int, int, int, double, int) { throw_no_cuda(); return Ptr<cuda::CornernessCriteria>(); }
Ptr<cuda::CornernessCriteria> cv::cuda::createMinEigenValCorner(int, int, int, int) { throw_no_cuda(); return Ptr<cuda::CornernessCriteria>(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
void cornerHarris_gpu(int block_size, float k, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream);
void cornerMinEigenVal_gpu(int block_size, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream);
}
}}}
namespace
{
class CornerBase : public CornernessCriteria
{
protected:
CornerBase(int srcType, int blockSize, int ksize, int borderType);
void extractCovData(const GpuMat& src, Stream& stream);
int srcType_;
int blockSize_;
int ksize_;
int borderType_;
GpuMat Dx_, Dy_;
private:
Ptr<cuda::Filter> filterDx_, filterDy_;
};
CornerBase::CornerBase(int srcType, int blockSize, int ksize, int borderType) :
srcType_(srcType), blockSize_(blockSize), ksize_(ksize), borderType_(borderType)
{
CV_Assert( borderType_ == BORDER_REFLECT101 || borderType_ == BORDER_REPLICATE || borderType_ == BORDER_REFLECT );
const int sdepth = CV_MAT_DEPTH(srcType_);
const int cn = CV_MAT_CN(srcType_);
CV_Assert( cn == 1 );
double scale = static_cast<double>(1 << ((ksize_ > 0 ? ksize_ : 3) - 1)) * blockSize_;
if (ksize_ < 0)
scale *= 2.;
if (sdepth == CV_8U)
scale *= 255.;
scale = 1./scale;
if (ksize_ > 0)
{
filterDx_ = cuda::createSobelFilter(srcType, CV_32F, 1, 0, ksize_, scale, borderType_);
filterDy_ = cuda::createSobelFilter(srcType, CV_32F, 0, 1, ksize_, scale, borderType_);
}
else
{
filterDx_ = cuda::createScharrFilter(srcType, CV_32F, 1, 0, scale, borderType_);
filterDy_ = cuda::createScharrFilter(srcType, CV_32F, 0, 1, scale, borderType_);
}
}
void CornerBase::extractCovData(const GpuMat& src, Stream& stream)
{
CV_Assert( src.type() == srcType_ );
filterDx_->apply(src, Dx_, stream);
filterDy_->apply(src, Dy_, stream);
}
class Harris : public CornerBase
{
public:
Harris(int srcType, int blockSize, int ksize, double k, int borderType) :
CornerBase(srcType, blockSize, ksize, borderType), k_(static_cast<float>(k))
{
}
void compute(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
private:
float k_;
};
void Harris::compute(InputArray _src, OutputArray _dst, Stream& stream)
{
using namespace cv::cuda::device::imgproc;
GpuMat src = _src.getGpuMat();
extractCovData(src, stream);
_dst.create(src.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();
cornerHarris_gpu(blockSize_, k_, Dx_, Dy_, dst, borderType_, StreamAccessor::getStream(stream));
}
class MinEigenVal : public CornerBase
{
public:
MinEigenVal(int srcType, int blockSize, int ksize, int borderType) :
CornerBase(srcType, blockSize, ksize, borderType)
{
}
void compute(InputArray src, OutputArray dst, Stream& stream = Stream::Null());
private:
float k_;
};
void MinEigenVal::compute(InputArray _src, OutputArray _dst, Stream& stream)
{
using namespace cv::cuda::device::imgproc;
GpuMat src = _src.getGpuMat();
extractCovData(src, stream);
_dst.create(src.size(), CV_32FC1);
GpuMat dst = _dst.getGpuMat();
cornerMinEigenVal_gpu(blockSize_, Dx_, Dy_, dst, borderType_, StreamAccessor::getStream(stream));
}
}
Ptr<cuda::CornernessCriteria> cv::cuda::createHarrisCorner(int srcType, int blockSize, int ksize, double k, int borderType)
{
return new Harris(srcType, blockSize, ksize, k, borderType);
}
Ptr<cuda::CornernessCriteria> cv::cuda::createMinEigenValCorner(int srcType, int blockSize, int ksize, int borderType)
{
return new MinEigenVal(srcType, blockSize, ksize, borderType);
}
#endif /* !defined (HAVE_CUDA) */
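
A minimal sketch, not part of the commit, of the corner-response factories defined above; the parameter values are arbitrary and the source type must be single channel, as asserted in CornerBase.

// Usage sketch (editorial illustration) for createHarrisCorner / createMinEigenValCorner.
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>

void cornerDemo(const cv::cuda::GpuMat& gray8u)
{
    cv::Ptr<cv::cuda::CornernessCriteria> harris =
        cv::cuda::createHarrisCorner(CV_8UC1, /*blockSize*/ 5, /*ksize*/ 3,
                                     /*k*/ 0.04, cv::BORDER_REFLECT101);
    cv::cuda::GpuMat response;                 // CV_32FC1 cornerness map
    harris->compute(gray8u, response);
}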


@@ -0,0 +1,199 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
using namespace cv::cuda;
typedef unsigned char uchar;
typedef unsigned short ushort;
//////////////////////////////////////////////////////////////////////////////////
/// Bilateral filtering
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
__device__ __forceinline__ float norm_l1(const float& a) { return ::fabs(a); }
__device__ __forceinline__ float norm_l1(const float2& a) { return ::fabs(a.x) + ::fabs(a.y); }
__device__ __forceinline__ float norm_l1(const float3& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z); }
__device__ __forceinline__ float norm_l1(const float4& a) { return ::fabs(a.x) + ::fabs(a.y) + ::fabs(a.z) + ::fabs(a.w); }
__device__ __forceinline__ float sqr(const float& a) { return a * a; }
template<typename T, typename B>
__global__ void bilateral_kernel(const PtrStepSz<T> src, PtrStep<T> dst, const B b, const int ksz, const float sigma_spatial2_inv_half, const float sigma_color2_inv_half)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x >= src.cols || y >= src.rows)
return;
value_type center = saturate_cast<value_type>(src(y, x));
value_type sum1 = VecTraits<value_type>::all(0);
float sum2 = 0;
int r = ksz / 2;
float r2 = (float)(r * r);
int tx = x - r + ksz;
int ty = y - r + ksz;
if (x - ksz/2 >=0 && y - ksz/2 >=0 && tx < src.cols && ty < src.rows)
{
for (int cy = y - r; cy < ty; ++cy)
for (int cx = x - r; cx < tx; ++cx)
{
float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
if (space2 > r2)
continue;
value_type value = saturate_cast<value_type>(src(cy, cx));
float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
sum1 = sum1 + weight * value;
sum2 = sum2 + weight;
}
}
else
{
for (int cy = y - r; cy < ty; ++cy)
for (int cx = x - r; cx < tx; ++cx)
{
float space2 = (x - cx) * (x - cx) + (y - cy) * (y - cy);
if (space2 > r2)
continue;
value_type value = saturate_cast<value_type>(b.at(cy, cx, src.data, src.step));
float weight = ::exp(space2 * sigma_spatial2_inv_half + sqr(norm_l1(value - center)) * sigma_color2_inv_half);
sum1 = sum1 + weight * value;
sum2 = sum2 + weight;
}
}
dst(y, x) = saturate_cast<T>(sum1 / sum2);
}
template<typename T, template <typename> class B>
void bilateral_caller(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream)
{
dim3 block (32, 8);
dim3 grid (divUp (src.cols, block.x), divUp (src.rows, block.y));
B<T> b(src.rows, src.cols);
float sigma_spatial2_inv_half = -0.5f/(sigma_spatial * sigma_spatial);
float sigma_color2_inv_half = -0.5f/(sigma_color * sigma_color);
cudaSafeCall( cudaFuncSetCacheConfig (bilateral_kernel<T, B<T> >, cudaFuncCachePreferL1) );
bilateral_kernel<<<grid, block>>>((PtrStepSz<T>)src, (PtrStepSz<T>)dst, b, kernel_size, sigma_spatial2_inv_half, sigma_color2_inv_half);
cudaSafeCall ( cudaGetLastError () );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template<typename T>
void bilateral_filter_gpu(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float gauss_spatial_coeff, float gauss_color_coeff, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb& src, PtrStepSzb dst, int kernel_size, float sigma_spatial, float sigma_color, cudaStream_t stream);
static caller_t funcs[] =
{
bilateral_caller<T, BrdConstant>,
bilateral_caller<T, BrdReplicate>,
bilateral_caller<T, BrdReflect>,
bilateral_caller<T, BrdWrap>,
bilateral_caller<T, BrdReflect101>
};
funcs[borderMode](src, dst, kernel_size, gauss_spatial_coeff, gauss_color_coeff, stream);
}
}
}}}
#define OCV_INSTANTIATE_BILATERAL_FILTER(T) \
template void cv::cuda::device::imgproc::bilateral_filter_gpu<T>(const PtrStepSzb&, PtrStepSzb, int, float, float, int, cudaStream_t);
OCV_INSTANTIATE_BILATERAL_FILTER(uchar)
//OCV_INSTANTIATE_BILATERAL_FILTER(uchar2)
OCV_INSTANTIATE_BILATERAL_FILTER(uchar3)
OCV_INSTANTIATE_BILATERAL_FILTER(uchar4)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar2)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar3)
//OCV_INSTANTIATE_BILATERAL_FILTER(schar4)
OCV_INSTANTIATE_BILATERAL_FILTER(short)
//OCV_INSTANTIATE_BILATERAL_FILTER(short2)
OCV_INSTANTIATE_BILATERAL_FILTER(short3)
OCV_INSTANTIATE_BILATERAL_FILTER(short4)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort)
//OCV_INSTANTIATE_BILATERAL_FILTER(ushort2)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort3)
OCV_INSTANTIATE_BILATERAL_FILTER(ushort4)
//OCV_INSTANTIATE_BILATERAL_FILTER(int)
//OCV_INSTANTIATE_BILATERAL_FILTER(int2)
//OCV_INSTANTIATE_BILATERAL_FILTER(int3)
//OCV_INSTANTIATE_BILATERAL_FILTER(int4)
OCV_INSTANTIATE_BILATERAL_FILTER(float)
//OCV_INSTANTIATE_BILATERAL_FILTER(float2)
OCV_INSTANTIATE_BILATERAL_FILTER(float3)
OCV_INSTANTIATE_BILATERAL_FILTER(float4)
#endif /* CUDA_DISABLER */
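
For clarity, a scalar, host-side restatement of the per-neighbor weight computed in bilateral_kernel above (editorial illustration, not part of the commit); it only makes the -0.5/sigma^2 coefficients passed in by bilateral_caller explicit.

// weight = exp(space2 * (-0.5/sigma_spatial^2) + |value - center|_1^2 * (-0.5/sigma_color^2))
#include <cmath>

inline float bilateralWeightRef(float spatialDist2, float colorDistL1,
                                float sigma_spatial, float sigma_color)
{
    const float sigma_spatial2_inv_half = -0.5f / (sigma_spatial * sigma_spatial);
    const float sigma_color2_inv_half   = -0.5f / (sigma_color * sigma_color);
    return std::exp(spatialDist2 * sigma_spatial2_inv_half
                    + colorDistL1 * colorDistL1 * sigma_color2_inv_half);
}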


@@ -0,0 +1,121 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
namespace cv { namespace cuda { namespace device
{
namespace blend
{
template <typename T>
__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < rows && x < cols)
{
int x_ = x / cn;
float w1 = weights1.ptr(y)[x_];
float w2 = weights2.ptr(y)[x_];
T p1 = img1.ptr(y)[x];
T p2 = img2.ptr(y)[x];
result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
}
}
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));
blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < rows && x < cols)
{
float w1 = weights1.ptr(y)[x];
float w2 = weights2.ptr(y)[x];
float sum_inv = 1.f / (w1 + w2 + 1e-5f);
w1 *= sum_inv;
w2 *= sum_inv;
uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];
uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];
((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,
p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);
}
}
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
} // namespace blend
}}} // namespace cv { namespace cuda { namespace device
#endif /* CUDA_DISABLER */
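
A scalar restatement of the per-element blend computed by both kernels above (editorial illustration, not part of the commit): a weighted average with a small epsilon guarding against a zero weight sum.

// result = (p1*w1 + p2*w2) / (w1 + w2 + 1e-5f), as in blendLinearKernel above.
inline float blendPixelRef(float p1, float p2, float w1, float w2)
{
    return (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
}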


@@ -0,0 +1,138 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
namespace cv { namespace cuda { namespace device
{
namespace hough
{
__device__ int g_counter;
template <int PIXELS_PER_THREAD>
__global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
{
__shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
__shared__ int s_qsize[4];
__shared__ int s_globStart[4];
const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (threadIdx.x == 0)
s_qsize[threadIdx.y] = 0;
__syncthreads();
if (y < src.rows)
{
// fill the queue
const uchar* srcRow = src.ptr(y);
for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < src.cols; ++i, xx += blockDim.x)
{
if (srcRow[xx])
{
const unsigned int val = (y << 16) | xx;
const int qidx = Emulation::smem::atomicAdd(&s_qsize[threadIdx.y], 1);
s_queues[threadIdx.y][qidx] = val;
}
}
}
__syncthreads();
// let one thread reserve the space required in the global list
if (threadIdx.x == 0 && threadIdx.y == 0)
{
// find how many items are stored in each list
int totalSize = 0;
for (int i = 0; i < blockDim.y; ++i)
{
s_globStart[i] = totalSize;
totalSize += s_qsize[i];
}
// calculate the offset in the global list
const int globalOffset = atomicAdd(&g_counter, totalSize);
for (int i = 0; i < blockDim.y; ++i)
s_globStart[i] += globalOffset;
}
__syncthreads();
// copy local queues to global queue
const int qsize = s_qsize[threadIdx.y];
int gidx = s_globStart[threadIdx.y] + threadIdx.x;
for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
list[gidx] = s_queues[threadIdx.y][i];
}
int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
{
const int PIXELS_PER_THREAD = 16;
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
const dim3 block(32, 4);
const dim3 grid(divUp(src.cols, block.x * PIXELS_PER_THREAD), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(buildPointList<PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );
buildPointList<PIXELS_PER_THREAD><<<grid, block>>>(src, list);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
return totalCount;
}
}
}}}
#endif /* CUDA_DISABLER */
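
A sequential, host-side reference of what buildPointList above produces (editorial illustration, not part of the commit): the coordinates of every non-zero pixel, packed as (y << 16) | x. The GPU version emits the same set of points, only in a different order.

// Collect packed coordinates of non-zero pixels of a CV_8UC1 image.
#include <vector>
#include <opencv2/core.hpp>

std::vector<unsigned int> buildPointListRef(const cv::Mat& src)
{
    std::vector<unsigned int> list;
    for (int y = 0; y < src.rows; ++y)
    {
        const uchar* row = src.ptr<uchar>(y);
        for (int x = 0; x < src.cols; ++x)
            if (row[x])
                list.push_back((static_cast<unsigned int>(y) << 16) | static_cast<unsigned int>(x));
    }
    return list;
}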


@@ -0,0 +1,494 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <utility>
#include <algorithm>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/utility.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace canny
{
struct L1 : binary_function<int, int, float>
{
__device__ __forceinline__ float operator ()(int x, int y) const
{
return ::abs(x) + ::abs(y);
}
__host__ __device__ __forceinline__ L1() {}
__host__ __device__ __forceinline__ L1(const L1&) {}
};
struct L2 : binary_function<int, int, float>
{
__device__ __forceinline__ float operator ()(int x, int y) const
{
return ::sqrtf(x * x + y * y);
}
__host__ __device__ __forceinline__ L2() {}
__host__ __device__ __forceinline__ L2(const L2&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
{
enum { smart_shift = 4 };
};
template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
{
enum { smart_shift = 4 };
};
}}}
namespace canny
{
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
struct SrcTex
{
int xoff;
int yoff;
__host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
__device__ __forceinline__ int operator ()(int y, int x) const
{
return tex2D(tex_src, x + xoff, y + yoff);
}
};
template <class Norm> __global__
void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y >= mag.rows || x >= mag.cols)
return;
int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
dx(y, x) = dxVal;
dy(y, x) = dyVal;
mag(y, x) = norm(dxVal, dyVal);
}
void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
{
const dim3 block(16, 16);
const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
bindTexture(&tex_src, srcWhole);
SrcTex src(xoff, yoff);
if (L2Grad)
{
L2 norm;
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
}
else
{
L1 norm;
calcMagnitudeKernel<<<grid, block>>>(src, dx, dy, mag, norm);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad)
{
if (L2Grad)
{
L2 norm;
transform(dx, dy, mag, norm, WithOutMask(), 0);
}
else
{
L1 norm;
transform(dx, dy, mag, norm, WithOutMask(), 0);
}
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
{
const int CANNY_SHIFT = 15;
const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
return;
int dxVal = dx(y, x);
int dyVal = dy(y, x);
const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
const float m = tex2D(tex_mag, x, y);
dxVal = ::abs(dxVal);
dyVal = ::abs(dyVal);
// 0 - the pixel can not belong to an edge
// 1 - the pixel might belong to an edge
// 2 - the pixel does belong to an edge
int edge_type = 0;
if (m > low_thresh)
{
const int tg22x = dxVal * TG22;
const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
dyVal <<= CANNY_SHIFT;
if (dyVal < tg22x)
{
if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
edge_type = 1 + (int)(m > high_thresh);
}
else if(dyVal > tg67x)
{
if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
else
{
if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
}
map(y, x) = edge_type;
}
void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
bindTexture(&tex_mag, mag);
calcMapKernel<<<grid, block>>>(dx, dy, map, low_thresh, high_thresh);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
__device__ int counter = 0;
__global__ void edgesHysteresisLocalKernel(PtrStepSzi map, ushort2* st)
{
__shared__ volatile int smem[18][18];
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
smem[threadIdx.y + 1][threadIdx.x + 1] = x < map.cols && y < map.rows ? map(y, x) : 0;
if (threadIdx.y == 0)
smem[0][threadIdx.x + 1] = y > 0 ? map(y - 1, x) : 0;
if (threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][threadIdx.x + 1] = y + 1 < map.rows ? map(y + 1, x) : 0;
if (threadIdx.x == 0)
smem[threadIdx.y + 1][0] = x > 0 ? map(y, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1)
smem[threadIdx.y + 1][blockDim.x + 1] = x + 1 < map.cols ? map(y, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == 0)
smem[0][0] = y > 0 && x > 0 ? map(y - 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
smem[0][blockDim.x + 1] = y > 0 && x + 1 < map.cols ? map(y - 1, x + 1) : 0;
if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][0] = y + 1 < map.rows && x > 0 ? map(y + 1, x - 1) : 0;
if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
smem[blockDim.y + 1][blockDim.x + 1] = y + 1 < map.rows && x + 1 < map.cols ? map(y + 1, x + 1) : 0;
__syncthreads();
if (x >= map.cols || y >= map.rows)
return;
int n;
#pragma unroll
for (int k = 0; k < 16; ++k)
{
n = 0;
if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
{
n += smem[threadIdx.y ][threadIdx.x ] == 2;
n += smem[threadIdx.y ][threadIdx.x + 1] == 2;
n += smem[threadIdx.y ][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 1][threadIdx.x ] == 2;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;
n += smem[threadIdx.y + 2][threadIdx.x ] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
}
if (n > 0)
smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
}
const int e = smem[threadIdx.y + 1][threadIdx.x + 1];
map(y, x) = e;
n = 0;
if (e == 2)
{
n += smem[threadIdx.y ][threadIdx.x ] == 1;
n += smem[threadIdx.y ][threadIdx.x + 1] == 1;
n += smem[threadIdx.y ][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 1][threadIdx.x ] == 1;
n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;
n += smem[threadIdx.y + 2][threadIdx.x ] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
}
if (n > 0)
{
const int ind = ::atomicAdd(&counter, 1);
st[ind] = make_ushort2(x, y);
}
}
void edgesHysteresisLocal(PtrStepSzi map, ushort2* st1)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
const dim3 block(16, 16);
const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));
edgesHysteresisLocalKernel<<<grid, block>>>(map, st1);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
__constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
__global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, ushort2* st1, ushort2* st2, const int count)
{
const int stack_size = 512;
__shared__ int s_counter;
__shared__ int s_ind;
__shared__ ushort2 s_st[stack_size];
if (threadIdx.x == 0)
s_counter = 0;
__syncthreads();
int ind = blockIdx.y * gridDim.x + blockIdx.x;
if (ind >= count)
return;
ushort2 pos = st1[ind];
if (threadIdx.x < 8)
{
pos.x += c_dx[threadIdx.x];
pos.y += c_dy[threadIdx.x];
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{
map(pos.y, pos.x) = 2;
ind = Emulation::smem::atomicAdd(&s_counter, 1);
s_st[ind] = pos;
}
}
__syncthreads();
while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
{
const int subTaskIdx = threadIdx.x >> 3;
const int portion = ::min(s_counter, blockDim.x >> 3);
if (subTaskIdx < portion)
pos = s_st[s_counter - 1 - subTaskIdx];
__syncthreads();
if (threadIdx.x == 0)
s_counter -= portion;
__syncthreads();
if (subTaskIdx < portion)
{
pos.x += c_dx[threadIdx.x & 7];
pos.y += c_dy[threadIdx.x & 7];
if (pos.x > 0 && pos.x < map.cols && pos.y > 0 && pos.y < map.rows && map(pos.y, pos.x) == 1)
{
map(pos.y, pos.x) = 2;
ind = Emulation::smem::atomicAdd(&s_counter, 1);
s_st[ind] = pos;
}
}
__syncthreads();
}
if (s_counter > 0)
{
if (threadIdx.x == 0)
{
ind = ::atomicAdd(&counter, s_counter);
s_ind = ind - s_counter;
}
__syncthreads();
ind = s_ind;
for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
st2[ind + i] = s_st[i];
}
}
void edgesHysteresisGlobal(PtrStepSzi map, ushort2* st1, ushort2* st2)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );
int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
while (count > 0)
{
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
const dim3 block(128);
const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);
edgesHysteresisGlobalKernel<<<grid, block>>>(map, st1, st2, count);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
std::swap(st1, st2);
}
}
}
//////////////////////////////////////////////////////////////////////////////////////////
namespace canny
{
struct GetEdges : unary_function<int, uchar>
{
__device__ __forceinline__ uchar operator ()(int e) const
{
return (uchar)(-(e >> 1));
}
__host__ __device__ __forceinline__ GetEdges() {}
__host__ __device__ __forceinline__ GetEdges(const GetEdges&) {}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
{
enum { smart_shift = 4 };
};
}}}
namespace canny
{
void getEdges(PtrStepSzi map, PtrStepSzb dst)
{
transform(map, dst, GetEdges(), WithOutMask(), 0);
}
}
#endif /* CUDA_DISABLER */
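
A sequential, host-side reference of the hysteresis stage implemented by the two kernels above (editorial illustration, not part of the commit): candidate pixels (map value 1) that touch a strong edge (map value 2) are promoted until no reachable candidates remain, using the same 8-neighbourhood and border handling.

#include <queue>
#include <utility>
#include <opencv2/core.hpp>

// map is CV_32SC1 holding 0 (non-edge), 1 (candidate) or 2 (strong edge).
void hysteresisRef(cv::Mat& map)
{
    static const int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1};
    static const int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1};
    std::queue<std::pair<int, int> > q;
    for (int y = 0; y < map.rows; ++y)
        for (int x = 0; x < map.cols; ++x)
            if (map.at<int>(y, x) == 2)
                q.push(std::make_pair(x, y));
    while (!q.empty())
    {
        const std::pair<int, int> p = q.front(); q.pop();
        for (int k = 0; k < 8; ++k)
        {
            const int nx = p.first + c_dx[k], ny = p.second + c_dy[k];
            if (nx > 0 && nx < map.cols && ny > 0 && ny < map.rows && map.at<int>(ny, nx) == 1)
            {
                map.at<int>(ny, nx) = 2;
                q.push(std::make_pair(nx, ny));
            }
        }
    }
}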


@@ -0,0 +1,186 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/scan.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace clahe
{
__global__ void calcLutKernel(const PtrStepb src, PtrStepb lut,
const int2 tileSize, const int tilesX,
const int clipLimit, const float lutScale)
{
__shared__ int smem[512];
const int tx = blockIdx.x;
const int ty = blockIdx.y;
const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
smem[tid] = 0;
__syncthreads();
for (int i = threadIdx.y; i < tileSize.y; i += blockDim.y)
{
const uchar* srcPtr = src.ptr(ty * tileSize.y + i) + tx * tileSize.x;
for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
{
const int data = srcPtr[j];
Emulation::smem::atomicAdd(&smem[data], 1);
}
}
__syncthreads();
int tHistVal = smem[tid];
__syncthreads();
if (clipLimit > 0)
{
// clip histogram bar
int clipped = 0;
if (tHistVal > clipLimit)
{
clipped = tHistVal - clipLimit;
tHistVal = clipLimit;
}
// find number of overall clipped samples
reduce<256>(smem, clipped, tid, plus<int>());
// broadcast evaluated value
__shared__ int totalClipped;
if (tid == 0)
totalClipped = clipped;
__syncthreads();
// redistribute clipped samples evenly
int redistBatch = totalClipped / 256;
tHistVal += redistBatch;
int residual = totalClipped - redistBatch * 256;
if (tid < residual)
++tHistVal;
}
const int lutVal = blockScanInclusive<256>(tHistVal, smem, tid);
lut(ty * tilesX + tx, tid) = saturate_cast<uchar>(__float2int_rn(lutScale * lutVal));
}
void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(tilesX, tilesY);
calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void tranformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= src.cols || y >= src.rows)
return;
const float tyf = (static_cast<float>(y) / tileSize.y) - 0.5f;
int ty1 = __float2int_rd(tyf);
int ty2 = ty1 + 1;
const float ya = tyf - ty1;
ty1 = ::max(ty1, 0);
ty2 = ::min(ty2, tilesY - 1);
const float txf = (static_cast<float>(x) / tileSize.x) - 0.5f;
int tx1 = __float2int_rd(txf);
int tx2 = tx1 + 1;
const float xa = txf - tx1;
tx1 = ::max(tx1, 0);
tx2 = ::min(tx2, tilesX - 1);
const int srcVal = src(y, x);
float res = 0;
res += lut(ty1 * tilesX + tx1, srcVal) * ((1.0f - xa) * (1.0f - ya));
res += lut(ty1 * tilesX + tx2, srcVal) * ((xa) * (1.0f - ya));
res += lut(ty2 * tilesX + tx1, srcVal) * ((1.0f - xa) * (ya));
res += lut(ty2 * tilesX + tx2, srcVal) * ((xa) * (ya));
dst(y, x) = saturate_cast<uchar>(res);
}
void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) );
transformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,461 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/color.hpp"
#include "cvt_color_internal.h"
namespace cv { namespace cuda { namespace device
{
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_x = 8 };
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
{
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
OPENCV_GPU_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
{
enum { smart_block_dim_y = 8 };
enum { smart_shift = 4 };
};
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream) \
{ \
traits::functor_type functor = traits::create_functor(); \
typedef typename traits::functor_type::argument_type src_t; \
typedef typename traits::functor_type::result_type dst_t; \
cv::cuda::device::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \
}
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(name) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(bgra_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL(hls4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_lab4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lrgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lrgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab_to_lbgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lab4_to_lbgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgb_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lrgba_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgr_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(lbgra_to_luv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lrgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lrgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv_to_lbgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(luv4_to_lbgra)
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F_FULL
}}} // namespace cv { namespace cuda { namespace device
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,280 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_CUDAFILTERS
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
/////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
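// Harris response computed from the derivative images bound to the textures
// above: a = sum(dx*dx), b = sum(dx*dy), c = sum(dy*dy) over a
// block_size x block_size window, response = det(M) - k*trace(M)^2
// = a*c - b*b - k*(a + c)^2.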
__global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float a = 0.f;
float b = 0.f;
float c = 0.f;
const int ibegin = y - (block_size / 2);
const int jbegin = x - (block_size / 2);
const int iend = ibegin + block_size;
const int jend = jbegin + block_size;
for (int i = ibegin; i < iend; ++i)
{
for (int j = jbegin; j < jend; ++j)
{
float dx = tex2D(harrisDxTex, j, i);
float dy = tex2D(harrisDyTex, j, i);
a += dx * dx;
b += dx * dy;
c += dy * dy;
}
}
dst(y, x) = a * c - b * b - k * (a + c) * (a + c);
}
}
template <typename BR, typename BC>
__global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float a = 0.f;
float b = 0.f;
float c = 0.f;
const int ibegin = y - (block_size / 2);
const int jbegin = x - (block_size / 2);
const int iend = ibegin + block_size;
const int jend = jbegin + block_size;
for (int i = ibegin; i < iend; ++i)
{
const int y = border_col.idx_row(i);
for (int j = jbegin; j < jend; ++j)
{
const int x = border_row.idx_col(j);
float dx = tex2D(harrisDxTex, x, y);
float dy = tex2D(harrisDyTex, x, y);
a += dx * dx;
b += dx * dy;
c += dy * dy;
}
}
dst(y, x) = a * c - b * b - k * (a + c) * (a + c);
}
}
void cornerHarris_gpu(int block_size, float k, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
bindTexture(&harrisDxTex, Dx);
bindTexture(&harrisDyTex, Dy);
switch (border_type)
{
case BORDER_REFLECT101:
cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
break;
case BORDER_REFLECT:
cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
break;
case BORDER_REPLICATE:
cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
/////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////
texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
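// Minimal eigenvalue of the structure tensor M = [[sum(dx*dx), sum(dx*dy)],
// [sum(dx*dy), sum(dy*dy)]]: with a and c halved below, the result is
// (a + c) - sqrt((a - c)^2 + b^2), i.e. the smaller eigenvalue of M.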
__global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float a = 0.f;
float b = 0.f;
float c = 0.f;
const int ibegin = y - (block_size / 2);
const int jbegin = x - (block_size / 2);
const int iend = ibegin + block_size;
const int jend = jbegin + block_size;
for (int i = ibegin; i < iend; ++i)
{
for (int j = jbegin; j < jend; ++j)
{
float dx = tex2D(minEigenValDxTex, j, i);
float dy = tex2D(minEigenValDyTex, j, i);
a += dx * dx;
b += dx * dy;
c += dy * dy;
}
}
a *= 0.5f;
c *= 0.5f;
dst(y, x) = (a + c) - sqrtf((a - c) * (a - c) + b * b);
}
}
template <typename BR, typename BC>
__global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float a = 0.f;
float b = 0.f;
float c = 0.f;
const int ibegin = y - (block_size / 2);
const int jbegin = x - (block_size / 2);
const int iend = ibegin + block_size;
const int jend = jbegin + block_size;
for (int i = ibegin; i < iend; ++i)
{
int y = border_col.idx_row(i);
for (int j = jbegin; j < jend; ++j)
{
int x = border_row.idx_col(j);
float dx = tex2D(minEigenValDxTex, x, y);
float dy = tex2D(minEigenValDyTex, x, y);
a += dx * dx;
b += dx * dy;
c += dy * dy;
}
}
a *= 0.5f;
c *= 0.5f;
dst(y, x) = (a + c) - sqrtf((a - c) * (a - c) + b * b);
}
}
void cornerMinEigenVal_gpu(int block_size, PtrStepSzf Dx, PtrStepSzf Dy, PtrStepSzf dst, int border_type, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
bindTexture(&minEigenValDxTex, Dx);
bindTexture(&minEigenValDyTex, Dy);
switch (border_type)
{
case BORDER_REFLECT101:
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
break;
case BORDER_REFLECT:
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
break;
case BORDER_REPLICATE:
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
}
}}}
#endif // HAVE_OPENCV_CUDAFILTERS
#endif // CUDA_DISABLER

View File

@@ -0,0 +1,544 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/color.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
namespace cv { namespace cuda { namespace device
{
template <typename T> struct Bayer2BGR;
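// Bayer demosaicing by bilinear interpolation. The 8-bit specialization loads
// the source as packed uchar4 words, so each thread reads a 3x3 neighborhood
// of words and reconstructs four adjacent BGR pixels (res0..res3);
// blue_last / start_with_green select the BG/GB/RG/GR CFA layout.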
template <> struct Bayer2BGR<uchar>
{
uchar3 res0;
uchar3 res1;
uchar3 res2;
uchar3 res3;
__device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
{
uchar4 patch[3][3];
patch[0][1] = ((const uchar4*) src.ptr(s_y - 1))[s_x];
patch[0][0] = ((const uchar4*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
patch[0][2] = ((const uchar4*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
patch[1][1] = ((const uchar4*) src.ptr(s_y))[s_x];
patch[1][0] = ((const uchar4*) src.ptr(s_y))[::max(s_x - 1, 0)];
patch[1][2] = ((const uchar4*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
patch[2][1] = ((const uchar4*) src.ptr(s_y + 1))[s_x];
patch[2][0] = ((const uchar4*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
patch[2][2] = ((const uchar4*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 3) >> 2) - 1)];
if ((s_y & 1) ^ start_with_green)
{
const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1;
const int t1 = (patch[1][0].w + patch[1][1].y + 1) >> 1;
const int t2 = (patch[0][1].x + patch[0][1].z + patch[2][1].x + patch[2][1].z + 2) >> 2;
const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][1].z + patch[2][1].y + 2) >> 2;
const int t4 = (patch[0][1].z + patch[2][1].z + 1) >> 1;
const int t5 = (patch[1][1].y + patch[1][1].w + 1) >> 1;
const int t6 = (patch[0][1].z + patch[0][2].x + patch[2][1].z + patch[2][2].x + 2) >> 2;
const int t7 = (patch[0][1].w + patch[1][1].z + patch[1][2].x + patch[2][1].w + 2) >> 2;
if ((s_y & 1) ^ blue_last)
{
res0.x = t1;
res0.y = patch[1][1].x;
res0.z = t0;
res1.x = patch[1][1].y;
res1.y = t3;
res1.z = t2;
res2.x = t5;
res2.y = patch[1][1].z;
res2.z = t4;
res3.x = patch[1][1].w;
res3.y = t7;
res3.z = t6;
}
else
{
res0.x = t0;
res0.y = patch[1][1].x;
res0.z = t1;
res1.x = t2;
res1.y = t3;
res1.z = patch[1][1].y;
res2.x = t4;
res2.y = patch[1][1].z;
res2.z = t5;
res3.x = t6;
res3.y = t7;
res3.z = patch[1][1].w;
}
}
else
{
const int t0 = (patch[0][0].w + patch[0][1].y + patch[2][0].w + patch[2][1].y + 2) >> 2;
const int t1 = (patch[0][1].x + patch[1][0].w + patch[1][1].y + patch[2][1].x + 2) >> 2;
const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
const int t3 = (patch[1][1].x + patch[1][1].z + 1) >> 1;
const int t4 = (patch[0][1].y + patch[0][1].w + patch[2][1].y + patch[2][1].w + 2) >> 2;
const int t5 = (patch[0][1].z + patch[1][1].y + patch[1][1].w + patch[2][1].z + 2) >> 2;
const int t6 = (patch[0][1].w + patch[2][1].w + 1) >> 1;
const int t7 = (patch[1][1].z + patch[1][2].x + 1) >> 1;
if ((s_y & 1) ^ blue_last)
{
res0.x = patch[1][1].x;
res0.y = t1;
res0.z = t0;
res1.x = t3;
res1.y = patch[1][1].y;
res1.z = t2;
res2.x = patch[1][1].z;
res2.y = t5;
res2.z = t4;
res3.x = t7;
res3.y = patch[1][1].w;
res3.z = t6;
}
else
{
res0.x = t0;
res0.y = t1;
res0.z = patch[1][1].x;
res1.x = t2;
res1.y = patch[1][1].y;
res1.z = t3;
res2.x = t4;
res2.y = t5;
res2.z = patch[1][1].z;
res3.x = t6;
res3.y = patch[1][1].w;
res3.z = t7;
}
}
}
};
template <typename D> __device__ __forceinline__ D toDst(const uchar3& pix);
template <> __device__ __forceinline__ uchar toDst<uchar>(const uchar3& pix)
{
typename bgr_to_gray_traits<uchar>::functor_type f = bgr_to_gray_traits<uchar>::create_functor();
return f(pix);
}
template <> __device__ __forceinline__ uchar3 toDst<uchar3>(const uchar3& pix)
{
return pix;
}
template <> __device__ __forceinline__ uchar4 toDst<uchar4>(const uchar3& pix)
{
return make_uchar4(pix.x, pix.y, pix.z, 255);
}
template <typename D>
__global__ void Bayer2BGR_8u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
if (s_y >= src.rows || (s_x << 2) >= src.cols)
return;
s_y = ::min(::max(s_y, 1), src.rows - 2);
Bayer2BGR<uchar> bayer;
bayer.apply(src, s_x, s_y, blue_last, start_with_green);
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
dst(d_y, d_x) = toDst<D>(bayer.res0);
if (d_x + 1 < src.cols)
dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
if (d_x + 2 < src.cols)
dst(d_y, d_x + 2) = toDst<D>(bayer.res2);
if (d_x + 3 < src.cols)
dst(d_y, d_x + 3) = toDst<D>(bayer.res3);
}
template <> struct Bayer2BGR<ushort>
{
ushort3 res0;
ushort3 res1;
__device__ void apply(const PtrStepSzb& src, int s_x, int s_y, bool blue_last, bool start_with_green)
{
ushort2 patch[3][3];
patch[0][1] = ((const ushort2*) src.ptr(s_y - 1))[s_x];
patch[0][0] = ((const ushort2*) src.ptr(s_y - 1))[::max(s_x - 1, 0)];
patch[0][2] = ((const ushort2*) src.ptr(s_y - 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
patch[1][1] = ((const ushort2*) src.ptr(s_y))[s_x];
patch[1][0] = ((const ushort2*) src.ptr(s_y))[::max(s_x - 1, 0)];
patch[1][2] = ((const ushort2*) src.ptr(s_y))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
patch[2][1] = ((const ushort2*) src.ptr(s_y + 1))[s_x];
patch[2][0] = ((const ushort2*) src.ptr(s_y + 1))[::max(s_x - 1, 0)];
patch[2][2] = ((const ushort2*) src.ptr(s_y + 1))[::min(s_x + 1, ((src.cols + 1) >> 1) - 1)];
if ((s_y & 1) ^ start_with_green)
{
const int t0 = (patch[0][1].x + patch[2][1].x + 1) >> 1;
const int t1 = (patch[1][0].y + patch[1][1].y + 1) >> 1;
const int t2 = (patch[0][1].x + patch[0][2].x + patch[2][1].x + patch[2][2].x + 2) >> 2;
const int t3 = (patch[0][1].y + patch[1][1].x + patch[1][2].x + patch[2][1].y + 2) >> 2;
if ((s_y & 1) ^ blue_last)
{
res0.x = t1;
res0.y = patch[1][1].x;
res0.z = t0;
res1.x = patch[1][1].y;
res1.y = t3;
res1.z = t2;
}
else
{
res0.x = t0;
res0.y = patch[1][1].x;
res0.z = t1;
res1.x = t2;
res1.y = t3;
res1.z = patch[1][1].y;
}
}
else
{
const int t0 = (patch[0][0].y + patch[0][1].y + patch[2][0].y + patch[2][1].y + 2) >> 2;
const int t1 = (patch[0][1].x + patch[1][0].y + patch[1][1].y + patch[2][1].x + 2) >> 2;
const int t2 = (patch[0][1].y + patch[2][1].y + 1) >> 1;
const int t3 = (patch[1][1].x + patch[1][2].x + 1) >> 1;
if ((s_y & 1) ^ blue_last)
{
res0.x = patch[1][1].x;
res0.y = t1;
res0.z = t0;
res1.x = t3;
res1.y = patch[1][1].y;
res1.z = t2;
}
else
{
res0.x = t0;
res0.y = t1;
res0.z = patch[1][1].x;
res1.x = t2;
res1.y = patch[1][1].y;
res1.z = t3;
}
}
}
};
template <typename D> __device__ __forceinline__ D toDst(const ushort3& pix);
template <> __device__ __forceinline__ ushort toDst<ushort>(const ushort3& pix)
{
typename bgr_to_gray_traits<ushort>::functor_type f = bgr_to_gray_traits<ushort>::create_functor();
return f(pix);
}
template <> __device__ __forceinline__ ushort3 toDst<ushort3>(const ushort3& pix)
{
return pix;
}
template <> __device__ __forceinline__ ushort4 toDst<ushort4>(const ushort3& pix)
{
return make_ushort4(pix.x, pix.y, pix.z, numeric_limits<ushort>::max());
}
template <typename D>
__global__ void Bayer2BGR_16u(const PtrStepSzb src, PtrStep<D> dst, const bool blue_last, const bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
if (s_y >= src.rows || (s_x << 1) >= src.cols)
return;
s_y = ::min(::max(s_y, 1), src.rows - 2);
Bayer2BGR<ushort> bayer;
bayer.apply(src, s_x, s_y, blue_last, start_with_green);
const int d_x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
const int d_y = blockIdx.y * blockDim.y + threadIdx.y;
dst(d_y, d_x) = toDst<D>(bayer.res0);
if (d_x + 1 < src.cols)
dst(d_y, d_x + 1) = toDst<D>(bayer.res1);
}
template <int cn>
void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, 4 * block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_8u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int cn>
void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<ushort, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, 2 * block.x), divUp(src.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_16u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void Bayer2BGR_8u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<1>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
//////////////////////////////////////////////////////////////
// Bayer Demosaicing (Malvar, He, and Cutler)
//
// by Morgan McGuire, Williams College
// http://graphics.cs.williams.edu/papers/BayerJGT09/#shaders
//
// ported to CUDA
texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);
template <typename DstType>
__global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed)
{
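// Filter weights of the Malvar-He-Cutler 5x5 demosaicing kernels, stored
// divided by 8 so the weighted sums below need no final normalization.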
const float kAx = -1.0f / 8.0f, kAy = -1.5f / 8.0f, kAz = 0.5f / 8.0f /*kAw = -1.0f / 8.0f*/;
const float kBx = 2.0f / 8.0f, /*kBy = 0.0f / 8.0f,*/ /*kBz = 0.0f / 8.0f,*/ kBw = 4.0f / 8.0f ;
const float kCx = 4.0f / 8.0f, kCy = 6.0f / 8.0f, kCz = 5.0f / 8.0f /*kCw = 5.0f / 8.0f*/;
const float /*kDx = 0.0f / 8.0f,*/ kDy = 2.0f / 8.0f, kDz = -1.0f / 8.0f /*kDw = -1.0f / 8.0f*/;
const float kEx = -1.0f / 8.0f, kEy = -1.5f / 8.0f, /*kEz = -1.0f / 8.0f,*/ kEw = 0.5f / 8.0f ;
const float kFx = 2.0f / 8.0f, /*kFy = 0.0f / 8.0f,*/ kFz = 4.0f / 8.0f /*kFw = 0.0f / 8.0f*/;
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x == 0 || x >= dst.cols - 1 || y == 0 || y >= dst.rows - 1)
return;
int2 center;
center.x = x + sourceOffset.x;
center.y = y + sourceOffset.y;
int4 xCoord;
xCoord.x = center.x - 2;
xCoord.y = center.x - 1;
xCoord.z = center.x + 1;
xCoord.w = center.x + 2;
int4 yCoord;
yCoord.x = center.y - 2;
yCoord.y = center.y - 1;
yCoord.z = center.y + 1;
yCoord.w = center.y + 2;
float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)
float4 Dvec;
Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)
float4 value;
value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0
// (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1
float4 PATTERN;
PATTERN.x = kCx * C;
PATTERN.y = kCy * C;
PATTERN.z = kCz * C;
PATTERN.w = PATTERN.z;
float D = Dvec.x + Dvec.y + Dvec.z + Dvec.w;
// There are five filter patterns (identity, cross, checker,
// theta, phi). Precompute the terms from all of them and then
// use swizzles to assign to color channels.
//
// Channel Matches
// x cross (e.g., EE G)
// y checker (e.g., EE B)
// z theta (e.g., EO R)
// w phi (e.g., EO B)
#define A value.x // A0 + A1
#define B value.y // B0 + B1
#define E value.z // E0 + E1
#define F value.w // F0 + F1
float3 temp;
// PATTERN.yzw += (kD.yz * D).xyy;
temp.x = kDy * D;
temp.y = kDz * D;
PATTERN.y += temp.x;
PATTERN.z += temp.y;
PATTERN.w += temp.y;
// PATTERN += (kA.xyz * A).xyzx;
temp.x = kAx * A;
temp.y = kAy * A;
temp.z = kAz * A;
PATTERN.x += temp.x;
PATTERN.y += temp.y;
PATTERN.z += temp.z;
PATTERN.w += temp.x;
// PATTERN += (kE.xyw * E).xyxz;
temp.x = kEx * E;
temp.y = kEy * E;
temp.z = kEw * E;
PATTERN.x += temp.x;
PATTERN.y += temp.y;
PATTERN.z += temp.x;
PATTERN.w += temp.z;
// PATTERN.xw += kB.xw * B;
PATTERN.x += kBx * B;
PATTERN.w += kBw * B;
// PATTERN.xz += kF.xz * F;
PATTERN.x += kFx * F;
PATTERN.z += kFz * F;
// Determine which of four types of pixels we are on.
int2 alternate;
alternate.x = (x + firstRed.x) % 2;
alternate.y = (y + firstRed.y) % 2;
// pixel assembled in BGR order
uchar3 pixelColor =
(alternate.y == 0) ?
((alternate.x == 0) ?
make_uchar3(saturate_cast<uchar>(PATTERN.y), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(C)) :
make_uchar3(saturate_cast<uchar>(PATTERN.w), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.z))) :
((alternate.x == 0) ?
make_uchar3(saturate_cast<uchar>(PATTERN.z), saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.w)) :
make_uchar3(saturate_cast<uchar>(C), saturate_cast<uchar>(PATTERN.x), saturate_cast<uchar>(PATTERN.y)));
dst(y, x) = toDst<DstType>(pixelColor);
}
template <int cn>
void MHCdemosaic(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream)
{
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
bindTexture(&sourceTex, src);
MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void MHCdemosaic<1>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
template void MHCdemosaic<3>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
template void MHCdemosaic<4>(PtrStepSzb src, int2 sourceOffset, PtrStepSzb dst, int2 firstRed, cudaStream_t stream);
}}}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,824 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_CUDAARITHM
namespace cv { namespace cuda { namespace device
{
namespace ght
{
__device__ int g_counter;
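// Compact the edge map into flat coordinate/orientation lists. Each thread row
// (threadIdx.y) keeps a local queue in shared memory; edge pixels with a
// non-zero gradient push their packed (y << 16 | x) coordinate and gradient
// angle in [0, 2*pi). One thread then reserves space in the global lists with
// a single atomicAdd on g_counter and the local queues are flushed.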
template <typename T, int PIXELS_PER_THREAD>
__global__ void buildEdgePointList(const PtrStepSzb edges, const PtrStep<T> dx, const PtrStep<T> dy,
unsigned int* coordList, float* thetaList)
{
__shared__ unsigned int s_coordLists[4][32 * PIXELS_PER_THREAD];
__shared__ float s_thetaLists[4][32 * PIXELS_PER_THREAD];
__shared__ int s_sizes[4];
__shared__ int s_globStart[4];
const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (threadIdx.x == 0)
s_sizes[threadIdx.y] = 0;
__syncthreads();
if (y < edges.rows)
{
// fill the queue
const uchar* edgesRow = edges.ptr(y);
const T* dxRow = dx.ptr(y);
const T* dyRow = dy.ptr(y);
for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < edges.cols; ++i, xx += blockDim.x)
{
const T dxVal = dxRow[xx];
const T dyVal = dyRow[xx];
if (edgesRow[xx] && (dxVal != 0 || dyVal != 0))
{
const unsigned int coord = (y << 16) | xx;
float theta = ::atan2f(dyVal, dxVal);
if (theta < 0)
theta += 2.0f * CV_PI_F;
const int qidx = Emulation::smem::atomicAdd(&s_sizes[threadIdx.y], 1);
s_coordLists[threadIdx.y][qidx] = coord;
s_thetaLists[threadIdx.y][qidx] = theta;
}
}
}
__syncthreads();
// let one thread reserve the space required in the global list
if (threadIdx.x == 0 && threadIdx.y == 0)
{
// find how many items are stored in each list
int totalSize = 0;
for (int i = 0; i < blockDim.y; ++i)
{
s_globStart[i] = totalSize;
totalSize += s_sizes[i];
}
// calculate the offset in the global list
const int globalOffset = atomicAdd(&g_counter, totalSize);
for (int i = 0; i < blockDim.y; ++i)
s_globStart[i] += globalOffset;
}
__syncthreads();
// copy local queues to global queue
const int qsize = s_sizes[threadIdx.y];
int gidx = s_globStart[threadIdx.y] + threadIdx.x;
for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
{
coordList[gidx] = s_coordLists[threadIdx.y][i];
thetaList[gidx] = s_thetaLists[threadIdx.y][i];
}
}
template <typename T>
int buildEdgePointList_gpu(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList)
{
const int PIXELS_PER_THREAD = 8;
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
const dim3 block(32, 4);
const dim3 grid(divUp(edges.cols, block.x * PIXELS_PER_THREAD), divUp(edges.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(buildEdgePointList<T, PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );
buildEdgePointList<T, PIXELS_PER_THREAD><<<grid, block>>>(edges, (PtrStepSz<T>) dx, (PtrStepSz<T>) dy, coordList, thetaList);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
return totalCount;
}
template int buildEdgePointList_gpu<short>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
template int buildEdgePointList_gpu<int>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
template int buildEdgePointList_gpu<float>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
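// Build the R-table for the Ballard transform: each template edge point is
// binned by its quantized gradient direction (levels bins over 2*pi) and the
// displacement from the template center is appended to that bin's row.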
__global__ void buildRTable(const unsigned int* coordList, const float* thetaList, const int pointsCount,
PtrStep<short2> r_table, int* r_sizes, int maxSize,
const short2 templCenter, const float thetaScale)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= pointsCount)
return;
const unsigned int coord = coordList[tid];
short2 p;
p.x = (coord & 0xFFFF);
p.y = (coord >> 16) & 0xFFFF;
const float theta = thetaList[tid];
const int n = __float2int_rn(theta * thetaScale);
const int ind = ::atomicAdd(r_sizes + n, 1);
if (ind < maxSize)
r_table(n, ind) = saturate_cast<short2>(p - templCenter);
}
void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
PtrStepSz<short2> r_table, int* r_sizes,
short2 templCenter, int levels)
{
const dim3 block(256);
const dim3 grid(divUp(pointsCount, block.x));
const float thetaScale = levels / (2.0f * CV_PI_F);
buildRTable<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, r_table.cols, templCenter, thetaScale);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// Ballard_Pos
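// Position-only voting: every image edge point looks up the R-table row that
// matches its gradient direction and votes for the candidate centers p - r
// (scaled by 1/dp) in an accumulator with a one-cell guard border; local
// maxima above the threshold are extracted by findPosInHist below.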
__global__ void Ballard_Pos_calcHist(const unsigned int* coordList, const float* thetaList, const int pointsCount,
const PtrStep<short2> r_table, const int* r_sizes,
PtrStepSzi hist,
const float idp, const float thetaScale)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= pointsCount)
return;
const unsigned int coord = coordList[tid];
short2 p;
p.x = (coord & 0xFFFF);
p.y = (coord >> 16) & 0xFFFF;
const float theta = thetaList[tid];
const int n = __float2int_rn(theta * thetaScale);
const short2* r_row = r_table.ptr(n);
const int r_row_size = r_sizes[n];
for (int j = 0; j < r_row_size; ++j)
{
short2 c = saturate_cast<short2>(p - r_row[j]);
c.x = __float2int_rn(c.x * idp);
c.y = __float2int_rn(c.y * idp);
if (c.x >= 0 && c.x < hist.cols - 2 && c.y >= 0 && c.y < hist.rows - 2)
::atomicAdd(hist.ptr(c.y + 1) + c.x + 1, 1);
}
}
void Ballard_Pos_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
PtrStepSz<short2> r_table, const int* r_sizes,
PtrStepSzi hist,
float dp, int levels)
{
const dim3 block(256);
const dim3 grid(divUp(pointsCount, block.x));
const float idp = 1.0f / dp;
const float thetaScale = levels / (2.0f * CV_PI_F);
Ballard_Pos_calcHist<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, hist, idp, thetaScale);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void Ballard_Pos_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes,
const int maxSize, const float dp, const int threshold)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= hist.cols - 2 || y >= hist.rows - 2)
return;
const int curVotes = hist(y + 1, x + 1);
if (curVotes > threshold &&
curVotes > hist(y + 1, x) &&
curVotes >= hist(y + 1, x + 2) &&
curVotes > hist(y, x + 1) &&
curVotes >= hist(y + 2, x + 1))
{
const int ind = ::atomicAdd(&g_counter, 1);
if (ind < maxSize)
{
out[ind] = make_float4(x * dp, y * dp, 1.0f, 0.0f);
votes[ind] = make_int3(curVotes, 0, 0);
}
}
}
int Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int maxSize, float dp, int threshold)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
const dim3 block(32, 8);
const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Ballard_Pos_findPosInHist, cudaFuncCachePreferL1) );
Ballard_Pos_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize, dp, threshold);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
totalCount = ::min(totalCount, maxSize);
return totalCount;
}
////////////////////////////////////////////////////////////////////////
// Guil_Full
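// Guil's full variant works on pairs of edge points whose gradient directions
// differ by xi (within angleEpsilon). Pairs are binned by the angle alpha
// between the segment p1-p2 and the gradient at p1; the template table also
// stores the displacements r1, r2 to the template center, so that orientation,
// scale and position can be recovered by matching image pairs against
// template pairs bin by bin.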
struct FeatureTable
{
uchar* p1_pos_data;
size_t p1_pos_step;
uchar* p1_theta_data;
size_t p1_theta_step;
uchar* p2_pos_data;
size_t p2_pos_step;
uchar* d12_data;
size_t d12_step;
uchar* r1_data;
size_t r1_step;
uchar* r2_data;
size_t r2_step;
};
__constant__ FeatureTable c_templFeatures;
__constant__ FeatureTable c_imageFeatures;
void Guil_Full_setTemplFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
{
FeatureTable tbl;
tbl.p1_pos_data = p1_pos.data;
tbl.p1_pos_step = p1_pos.step;
tbl.p1_theta_data = p1_theta.data;
tbl.p1_theta_step = p1_theta.step;
tbl.p2_pos_data = p2_pos.data;
tbl.p2_pos_step = p2_pos.step;
tbl.d12_data = d12.data;
tbl.d12_step = d12.step;
tbl.r1_data = r1.data;
tbl.r1_step = r1.step;
tbl.r2_data = r2.data;
tbl.r2_step = r2.step;
cudaSafeCall( cudaMemcpyToSymbol(c_templFeatures, &tbl, sizeof(FeatureTable)) );
}
void Guil_Full_setImageFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
{
FeatureTable tbl;
tbl.p1_pos_data = p1_pos.data;
tbl.p1_pos_step = p1_pos.step;
tbl.p1_theta_data = p1_theta.data;
tbl.p1_theta_step = p1_theta.step;
tbl.p2_pos_data = p2_pos.data;
tbl.p2_pos_step = p2_pos.step;
tbl.d12_data = d12.data;
tbl.d12_step = d12.step;
tbl.r1_data = r1.data;
tbl.r1_step = r1.step;
tbl.r2_data = r2.data;
tbl.r2_step = r2.step;
cudaSafeCall( cudaMemcpyToSymbol(c_imageFeatures, &tbl, sizeof(FeatureTable)) );
}
struct TemplFeatureTable
{
static __device__ float2* p1_pos(int n)
{
return (float2*)(c_templFeatures.p1_pos_data + n * c_templFeatures.p1_pos_step);
}
static __device__ float* p1_theta(int n)
{
return (float*)(c_templFeatures.p1_theta_data + n * c_templFeatures.p1_theta_step);
}
static __device__ float2* p2_pos(int n)
{
return (float2*)(c_templFeatures.p2_pos_data + n * c_templFeatures.p2_pos_step);
}
static __device__ float* d12(int n)
{
return (float*)(c_templFeatures.d12_data + n * c_templFeatures.d12_step);
}
static __device__ float2* r1(int n)
{
return (float2*)(c_templFeatures.r1_data + n * c_templFeatures.r1_step);
}
static __device__ float2* r2(int n)
{
return (float2*)(c_templFeatures.r2_data + n * c_templFeatures.r2_step);
}
};
struct ImageFeatureTable
{
static __device__ float2* p1_pos(int n)
{
return (float2*)(c_imageFeatures.p1_pos_data + n * c_imageFeatures.p1_pos_step);
}
static __device__ float* p1_theta(int n)
{
return (float*)(c_imageFeatures.p1_theta_data + n * c_imageFeatures.p1_theta_step);
}
static __device__ float2* p2_pos(int n)
{
return (float2*)(c_imageFeatures.p2_pos_data + n * c_imageFeatures.p2_pos_step);
}
static __device__ float* d12(int n)
{
return (float*)(c_imageFeatures.d12_data + n * c_imageFeatures.d12_step);
}
static __device__ float2* r1(int n)
{
return (float2*)(c_imageFeatures.r1_data + n * c_imageFeatures.r1_step);
}
static __device__ float2* r2(int n)
{
return (float2*)(c_imageFeatures.r2_data + n * c_imageFeatures.r2_step);
}
};
__device__ float clampAngle(float a)
{
float res = a;
while (res > 2.0f * CV_PI_F)
res -= 2.0f * CV_PI_F;
while (res < 0.0f)
res += 2.0f * CV_PI_F;
return res;
}
__device__ bool angleEq(float a, float b, float eps)
{
return (::fabs(clampAngle(a - b)) <= eps);
}
template <class FT, bool isTempl>
__global__ void Guil_Full_buildFeatureList(const unsigned int* coordList, const float* thetaList, const int pointsCount,
int* sizes, const int maxSize,
const float xi, const float angleEpsilon, const float alphaScale,
const float2 center, const float maxDist)
{
const float p1_theta = thetaList[blockIdx.x];
const unsigned int coord1 = coordList[blockIdx.x];
float2 p1_pos;
p1_pos.x = (coord1 & 0xFFFF);
p1_pos.y = (coord1 >> 16) & 0xFFFF;
for (int i = threadIdx.x; i < pointsCount; i += blockDim.x)
{
const float p2_theta = thetaList[i];
const unsigned int coord2 = coordList[i];
float2 p2_pos;
p2_pos.x = (coord2 & 0xFFFF);
p2_pos.y = (coord2 >> 16) & 0xFFFF;
if (angleEq(p1_theta - p2_theta, xi, angleEpsilon))
{
const float2 d = p1_pos - p2_pos;
float alpha12 = clampAngle(::atan2(d.y, d.x) - p1_theta);
float d12 = ::sqrtf(d.x * d.x + d.y * d.y);
if (d12 > maxDist)
continue;
float2 r1 = p1_pos - center;
float2 r2 = p2_pos - center;
const int n = __float2int_rn(alpha12 * alphaScale);
const int ind = ::atomicAdd(sizes + n, 1);
if (ind < maxSize)
{
if (!isTempl)
{
FT::p1_pos(n)[ind] = p1_pos;
FT::p2_pos(n)[ind] = p2_pos;
}
FT::p1_theta(n)[ind] = p1_theta;
FT::d12(n)[ind] = d12;
if (isTempl)
{
FT::r1(n)[ind] = r1;
FT::r2(n)[ind] = r2;
}
}
}
}
}
template <class FT, bool isTempl>
void Guil_Full_buildFeatureList_caller(const unsigned int* coordList, const float* thetaList, int pointsCount,
int* sizes, int maxSize,
float xi, float angleEpsilon, int levels,
float2 center, float maxDist)
{
const dim3 block(256);
const dim3 grid(pointsCount);
const float alphaScale = levels / (2.0f * CV_PI_F);
Guil_Full_buildFeatureList<FT, isTempl><<<grid, block>>>(coordList, thetaList, pointsCount,
sizes, maxSize,
xi * (CV_PI_F / 180.0f), angleEpsilon * (CV_PI_F / 180.0f), alphaScale,
center, maxDist);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
thrust::device_ptr<int> sizesPtr(sizes);
thrust::transform(sizesPtr, sizesPtr + levels + 1, sizesPtr, device::bind2nd(device::minimum<int>(), maxSize));
}
void Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
int* sizes, int maxSize,
float xi, float angleEpsilon, int levels,
float2 center, float maxDist)
{
Guil_Full_buildFeatureList_caller<TemplFeatureTable, true>(coordList, thetaList, pointsCount,
sizes, maxSize,
xi, angleEpsilon, levels,
center, maxDist);
}
void Guil_Full_buildImageFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
int* sizes, int maxSize,
float xi, float angleEpsilon, int levels,
float2 center, float maxDist)
{
Guil_Full_buildFeatureList_caller<ImageFeatureTable, false>(coordList, thetaList, pointsCount,
sizes, maxSize,
xi, angleEpsilon, levels,
center, maxDist);
}
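// Orientation histogram: for every (template pair, image pair) in the same alpha12
// bin, vote for the rotation angle given by the difference of their p1_theta values,
// restricted to [minAngle, maxAngle]. One block handles one template feature; votes
// accumulate in shared memory and are merged into the global histogram at the end.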
__global__ void Guil_Full_calcOHist(const int* templSizes, const int* imageSizes, int* OHist,
const float minAngle, const float maxAngle, const float iAngleStep, const int angleRange)
{
extern __shared__ int s_OHist[];
for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
s_OHist[i] = 0;
__syncthreads();
const int tIdx = blockIdx.x;
const int level = blockIdx.y;
const int tSize = templSizes[level];
if (tIdx < tSize)
{
const int imSize = imageSizes[level];
const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx];
for (int i = threadIdx.x; i < imSize; i += blockDim.x)
{
const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
const float angle = clampAngle(im_p1_theta - t_p1_theta);
if (angle >= minAngle && angle <= maxAngle)
{
const int n = __float2int_rn((angle - minAngle) * iAngleStep);
Emulation::smem::atomicAdd(&s_OHist[n], 1);
}
}
}
__syncthreads();
for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
::atomicAdd(OHist + i, s_OHist[i]);
}
void Guil_Full_calcOHist_gpu(const int* templSizes, const int* imageSizes, int* OHist,
float minAngle, float maxAngle, float angleStep, int angleRange,
int levels, int tMaxSize)
{
const dim3 block(256);
const dim3 grid(tMaxSize, levels + 1);
minAngle *= (CV_PI_F / 180.0f);
maxAngle *= (CV_PI_F / 180.0f);
angleStep *= (CV_PI_F / 180.0f);
const size_t smemSize = (angleRange + 1) * sizeof(int);
Guil_Full_calcOHist<<<grid, block, smemSize>>>(templSizes, imageSizes, OHist,
minAngle, maxAngle, 1.0f / angleStep, angleRange);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
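// Scale histogram: with the best rotation fixed, pairs whose rotated p1_theta values
// agree vote for the scale given by the ratio of their d12 distances.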
__global__ void Guil_Full_calcSHist(const int* templSizes, const int* imageSizes, int* SHist,
const float angle, const float angleEpsilon,
const float minScale, const float maxScale, const float iScaleStep, const int scaleRange)
{
extern __shared__ int s_SHist[];
for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
s_SHist[i] = 0;
__syncthreads();
const int tIdx = blockIdx.x;
const int level = blockIdx.y;
const int tSize = templSizes[level];
if (tIdx < tSize)
{
const int imSize = imageSizes[level];
const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
const float t_d12 = TemplFeatureTable::d12(level)[tIdx];
for (int i = threadIdx.x; i < imSize; i += blockDim.x)
{
const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
const float im_d12 = ImageFeatureTable::d12(level)[i];
if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
{
const float scale = im_d12 / t_d12;
if (scale >= minScale && scale <= maxScale)
{
const int s = __float2int_rn((scale - minScale) * iScaleStep);
Emulation::smem::atomicAdd(&s_SHist[s], 1);
}
}
}
}
__syncthreads();
for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
::atomicAdd(SHist + i, s_SHist[i]);
}
void Guil_Full_calcSHist_gpu(const int* templSizes, const int* imageSizes, int* SHist,
float angle, float angleEpsilon,
float minScale, float maxScale, float iScaleStep, int scaleRange,
int levels, int tMaxSize)
{
const dim3 block(256);
const dim3 grid(tMaxSize, levels + 1);
angle *= (CV_PI_F / 180.0f);
angleEpsilon *= (CV_PI_F / 180.0f);
const size_t smemSize = (scaleRange + 1) * sizeof(int);
Guil_Full_calcSHist<<<grid, block, smemSize>>>(templSizes, imageSizes, SHist,
angle, angleEpsilon,
minScale, maxScale, iScaleStep, scaleRange);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
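// Position histogram: with rotation and scale fixed, rotate and scale the stored
// displacements r1, r2 and subtract them from the matching image points; a vote is
// cast only when both candidate reference positions land within one accumulator cell
// of each other.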
__global__ void Guil_Full_calcPHist(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
const float angle, const float sinVal, const float cosVal, const float angleEpsilon, const float scale,
const float idp)
{
const int tIdx = blockIdx.x;
const int level = blockIdx.y;
const int tSize = templSizes[level];
if (tIdx < tSize)
{
const int imSize = imageSizes[level];
const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
float2 r1 = TemplFeatureTable::r1(level)[tIdx];
float2 r2 = TemplFeatureTable::r2(level)[tIdx];
r1 = r1 * scale;
r2 = r2 * scale;
r1 = make_float2(cosVal * r1.x - sinVal * r1.y, sinVal * r1.x + cosVal * r1.y);
r2 = make_float2(cosVal * r2.x - sinVal * r2.y, sinVal * r2.x + cosVal * r2.y);
for (int i = threadIdx.x; i < imSize; i += blockDim.x)
{
const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
const float2 im_p1_pos = ImageFeatureTable::p1_pos(level)[i];
const float2 im_p2_pos = ImageFeatureTable::p2_pos(level)[i];
if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
{
float2 c1, c2;
c1 = im_p1_pos - r1;
c1 = c1 * idp;
c2 = im_p2_pos - r2;
c2 = c2 * idp;
if (::fabs(c1.x - c2.x) > 1 || ::fabs(c1.y - c2.y) > 1)
continue;
if (c1.y >= 0 && c1.y < PHist.rows - 2 && c1.x >= 0 && c1.x < PHist.cols - 2)
::atomicAdd(PHist.ptr(__float2int_rn(c1.y) + 1) + __float2int_rn(c1.x) + 1, 1);
}
}
}
}
void Guil_Full_calcPHist_gpu(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
float angle, float angleEpsilon, float scale,
float dp,
int levels, int tMaxSize)
{
const dim3 block(256);
const dim3 grid(tMaxSize, levels + 1);
angle *= (CV_PI_F / 180.0f);
angleEpsilon *= (CV_PI_F / 180.0f);
const float sinVal = ::sinf(angle);
const float cosVal = ::cosf(angle);
cudaSafeCall( cudaFuncSetCacheConfig(Guil_Full_calcPHist, cudaFuncCachePreferL1) );
Guil_Full_calcPHist<<<grid, block>>>(templSizes, imageSizes, PHist,
angle, sinVal, cosVal, angleEpsilon, scale,
1.0f / dp);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
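// Extract local maxima of the position histogram (strict comparison against the left
// and top neighbours, non-strict against the right and bottom ones) and append the
// resulting (x, y, scale, angle) candidates together with their vote counts.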
__global__ void Guil_Full_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize,
const float angle, const int angleVotes, const float scale, const int scaleVotes,
const float dp, const int threshold)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= hist.cols - 2 || y >= hist.rows - 2)
return;
const int curVotes = hist(y + 1, x + 1);
if (curVotes > threshold &&
curVotes > hist(y + 1, x) &&
curVotes >= hist(y + 1, x + 2) &&
curVotes > hist(y, x + 1) &&
curVotes >= hist(y + 2, x + 1))
{
const int ind = ::atomicAdd(&g_counter, 1);
if (ind < maxSize)
{
out[ind] = make_float4(x * dp, y * dp, scale, angle);
votes[ind] = make_int3(curVotes, scaleVotes, angleVotes);
}
}
}
int Guil_Full_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int curSize, int maxSize,
float angle, int angleVotes, float scale, int scaleVotes,
float dp, int threshold)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemcpy(counterPtr, &curSize, sizeof(int), cudaMemcpyHostToDevice) );
const dim3 block(32, 8);
const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(Guil_Full_findPosInHist, cudaFuncCachePreferL1) );
Guil_Full_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize,
angle, angleVotes, scale, scaleVotes,
dp, threshold);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
totalCount = ::min(totalCount, maxSize);
return totalCount;
}
}
}}}
#endif // HAVE_OPENCV_CUDAARITHM
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,143 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
namespace cv { namespace cuda { namespace device
{
namespace gfft
{
texture<float, cudaTextureType2D, cudaReadModeElementType> eigTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__device__ int g_counter = 0;
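// goodFeaturesToTrack corner collection: keep pixels whose corner response (read
// through eigTex) exceeds the threshold and is the maximum of its 3x3 neighbourhood,
// appending their coordinates via a global atomic counter.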
template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols)
{
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j))
{
float val = tex2D(eigTex, j, i);
if (val > threshold)
{
float maxVal = val;
maxVal = ::fmax(tex2D(eigTex, j - 1, i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j , i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i - 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j - 1, i), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i), maxVal);
maxVal = ::fmax(tex2D(eigTex, j - 1, i + 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j , i + 1), maxVal);
maxVal = ::fmax(tex2D(eigTex, j + 1, i + 1), maxVal);
if (val == maxVal)
{
const int ind = ::atomicAdd(&g_counter, 1);
if (ind < max_count)
corners[ind] = make_float2(j, i);
}
}
}
}
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(int)) );
bindTexture(&eigTex, eig);
dim3 block(16, 16);
dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));
if (mask.data)
findCorners<<<grid, block>>>(threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols);
else
findCorners<<<grid, block>>>(threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int count;
cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost) );
return std::min(count, max_count);
}
class EigGreater
{
public:
__device__ __forceinline__ bool operator()(float2 a, float2 b) const
{
return tex2D(eigTex, a.x, a.y) > tex2D(eigTex, b.x, b.y);
}
};
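// Sort the collected corners by decreasing corner response so the strongest come first.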
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count)
{
bindTexture(&eigTex, eig);
thrust::device_ptr<float2> ptr(corners);
thrust::sort(ptr, ptr + count, EigGreater());
}
} // namespace gfft
}}}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,233 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace hist
{
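// 256-bin histogram of an 8-bit image: each block histograms a band of rows into a
// shared-memory histogram (reading pixels as packed 32-bit words where possible) and
// then merges the non-zero bins into the global histogram with atomics.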
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
{
__shared__ int shist[256];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
unsigned int data = rowPtr[x];
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
unsigned int data = ((const uchar*)rowPtr)[x];
Emulation::smem::atomicAdd(&shist[data], 1);
}
}
}
__syncthreads();
const int histVal = shist[tid];
if (histVal > 0)
::atomicAdd(hist + tid, histVal);
}
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
/////////////////////////////////////////////////////////////////////////
namespace hist
{
__device__ __forceinline__ void histEvenInc(int* shist, uint data, int binSize, int lowerLevel, int upperLevel)
{
if (data >= lowerLevel && data <= upperLevel)
{
const uint ind = (data - lowerLevel) / binSize;
Emulation::smem::atomicAdd(shist + ind, 1);
}
}
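// Evenly spaced histogram for 8-bit input: values in [lowerLevel, upperLevel] are
// mapped to bins of width binSize; each block accumulates into dynamic shared memory
// and merges its partial histogram into the global one at the end.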
__global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
{
extern __shared__ int shist[];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if (tid < binCount)
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const uchar* rowPtr = src + y * step;
const uint* rowPtr4 = (uint*) rowPtr;
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
const uint data = rowPtr4[x];
histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
const uchar data = rowPtr[x];
histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
}
}
}
__syncthreads();
if (tid < binCount)
{
const int histVal = shist[tid];
if (histVal > 0)
::atomicAdd(hist + tid, histVal);
}
}
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
const int binSize = divUp(upperLevel - lowerLevel, binCount);
const size_t smem_size = binCount * sizeof(int);
histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
/////////////////////////////////////////////////////////////////////////
namespace hist
{
__constant__ int c_lut[256];
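// Histogram equalization: c_lut holds the lookup table built by the caller (the
// cumulative histogram); EqualizeHist maps each pixel through it and rescales by
// 255 / (rows * cols).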
struct EqualizeHist : unary_function<uchar, uchar>
{
float scale;
__host__ EqualizeHist(float _scale) : scale(_scale) {}
__device__ __forceinline__ uchar operator ()(uchar val) const
{
const int lut = c_lut[val];
return __float2int_rn(scale * lut);
}
};
}
namespace cv { namespace cuda { namespace device
{
template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist>
{
enum { smart_shift = 4 };
};
}}}
namespace hist
{
void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
{
if (stream == 0)
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
else
cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );
const float scale = 255.0f / (src.cols * src.rows);
device::transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
}
}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,260 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/dynamic_smem.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_CUDAFILTERS
namespace cv { namespace cuda { namespace device
{
namespace hough_circles
{
__device__ int g_counter;
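// Hough circle detection runs in three passes: accumulate candidate centers along the
// gradient direction of every edge point (circlesAccumCenters), pick accumulator peaks
// as centers (buildCentersList), then build a radius histogram per center and keep the
// radii with enough support (circlesAccumRadius).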
////////////////////////////////////////////////////////////////////////
// circlesAccumCenters
__global__ void circlesAccumCenters(const unsigned int* list, const int count, const PtrStepi dx, const PtrStepi dy,
PtrStepi accum, const int width, const int height, const int minRadius, const int maxRadius, const float idp)
{
const int SHIFT = 10;
const int ONE = 1 << SHIFT;
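// Positions and the per-radius step are kept in fixed point with SHIFT fractional
// bits; shifting back by SHIFT recovers integer accumulator coordinates.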
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= count)
return;
const unsigned int val = list[tid];
const int x = (val & 0xFFFF);
const int y = (val >> 16) & 0xFFFF;
const int vx = dx(y, x);
const int vy = dy(y, x);
if (vx == 0 && vy == 0)
return;
const float mag = ::sqrtf(vx * vx + vy * vy);
const int x0 = __float2int_rn((x * idp) * ONE);
const int y0 = __float2int_rn((y * idp) * ONE);
int sx = __float2int_rn((vx * idp) * ONE / mag);
int sy = __float2int_rn((vy * idp) * ONE / mag);
// Step from minRadius to maxRadius in both directions of the gradient
for (int k1 = 0; k1 < 2; ++k1)
{
int x1 = x0 + minRadius * sx;
int y1 = y0 + minRadius * sy;
for (int r = minRadius; r <= maxRadius; x1 += sx, y1 += sy, ++r)
{
const int x2 = x1 >> SHIFT;
const int y2 = y1 >> SHIFT;
if (x2 < 0 || x2 >= width || y2 < 0 || y2 >= height)
break;
::atomicAdd(accum.ptr(y2 + 1) + x2 + 1, 1);
}
sx = -sx;
sy = -sy;
}
}
void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
{
const dim3 block(256);
const dim3 grid(divUp(count, block.x));
cudaSafeCall( cudaFuncSetCacheConfig(circlesAccumCenters, cudaFuncCachePreferL1) );
circlesAccumCenters<<<grid, block>>>(list, count, dx, dy, accum, accum.cols - 2, accum.rows - 2, minRadius, maxRadius, idp);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// buildCentersList
__global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < accum.cols - 2 && y < accum.rows - 2)
{
const int top = accum(y, x + 1);
const int left = accum(y + 1, x);
const int cur = accum(y + 1, x + 1);
const int right = accum(y + 1, x + 2);
const int bottom = accum(y + 2, x + 1);
if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right)
{
const unsigned int val = (y << 16) | x;
const int idx = ::atomicAdd(&g_counter, 1);
centers[idx] = val;
}
}
}
int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
const dim3 block(32, 8);
const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(buildCentersList, cudaFuncCachePreferL1) );
buildCentersList<<<grid, block>>>(accum, centers, threshold);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
return totalCount;
}
////////////////////////////////////////////////////////////////////////
// circlesAccumRadius
__global__ void circlesAccumRadius(const unsigned int* centers, const unsigned int* list, const int count,
float3* circles, const int maxCircles, const float dp,
const int minRadius, const int maxRadius, const int histSize, const int threshold)
{
int* smem = DynamicSharedMem<int>();
for (int i = threadIdx.x; i < histSize + 2; i += blockDim.x)
smem[i] = 0;
__syncthreads();
unsigned int val = centers[blockIdx.x];
float cx = (val & 0xFFFF);
float cy = (val >> 16) & 0xFFFF;
cx = (cx + 0.5f) * dp;
cy = (cy + 0.5f) * dp;
for (int i = threadIdx.x; i < count; i += blockDim.x)
{
val = list[i];
const int x = (val & 0xFFFF);
const int y = (val >> 16) & 0xFFFF;
const float rad = ::sqrtf((cx - x) * (cx - x) + (cy - y) * (cy - y));
if (rad >= minRadius && rad <= maxRadius)
{
const int r = __float2int_rn(rad - minRadius);
Emulation::smem::atomicAdd(&smem[r + 1], 1);
}
}
__syncthreads();
for (int i = threadIdx.x; i < histSize; i += blockDim.x)
{
const int curVotes = smem[i + 1];
if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2])
{
const int ind = ::atomicAdd(&g_counter, 1);
if (ind < maxCircles)
circles[ind] = make_float3(cx, cy, i + minRadius);
}
}
}
int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
const dim3 block(has20 ? 1024 : 512);
const dim3 grid(centersCount);
const int histSize = maxRadius - minRadius + 1;
size_t smemSize = (histSize + 2) * sizeof(int);
circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
totalCount = ::min(totalCount, maxCircles);
return totalCount;
}
}
}}}
#endif // HAVE_OPENCV_CUDAFILTERS
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,212 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/dynamic_smem.hpp"
namespace cv { namespace cuda { namespace device
{
namespace hough_lines
{
__device__ int g_counter;
////////////////////////////////////////////////////////////////////////
// linesAccum
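// One block per theta value. The shared-memory variant keeps a private rho histogram
// per angle and writes it out once; the global variant falls back to atomics on the
// accumulator when that histogram does not fit into shared memory.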
__global__ void linesAccumGlobal(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
{
const int n = blockIdx.x;
const float ang = n * theta;
float sinVal;
float cosVal;
sincosf(ang, &sinVal, &cosVal);
sinVal *= irho;
cosVal *= irho;
const int shift = (numrho - 1) / 2;
int* accumRow = accum.ptr(n + 1);
for (int i = threadIdx.x; i < count; i += blockDim.x)
{
const unsigned int val = list[i];
const int x = (val & 0xFFFF);
const int y = (val >> 16) & 0xFFFF;
int r = __float2int_rn(x * cosVal + y * sinVal);
r += shift;
::atomicAdd(accumRow + r + 1, 1);
}
}
__global__ void linesAccumShared(const unsigned int* list, const int count, PtrStepi accum, const float irho, const float theta, const int numrho)
{
int* smem = DynamicSharedMem<int>();
for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
smem[i] = 0;
__syncthreads();
const int n = blockIdx.x;
const float ang = n * theta;
float sinVal;
float cosVal;
sincosf(ang, &sinVal, &cosVal);
sinVal *= irho;
cosVal *= irho;
const int shift = (numrho - 1) / 2;
for (int i = threadIdx.x; i < count; i += blockDim.x)
{
const unsigned int val = list[i];
const int x = (val & 0xFFFF);
const int y = (val >> 16) & 0xFFFF;
int r = __float2int_rn(x * cosVal + y * sinVal);
r += shift;
Emulation::smem::atomicAdd(&smem[r + 1], 1);
}
__syncthreads();
int* accumRow = accum.ptr(n + 1);
for (int i = threadIdx.x; i < numrho + 1; i += blockDim.x)
accumRow[i] = smem[i];
}
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
{
const dim3 block(has20 ? 1024 : 512);
const dim3 grid(accum.rows - 2);
size_t smemSize = (accum.cols - 1) * sizeof(int);
if (smemSize < sharedMemPerBlock - 1000)
linesAccumShared<<<grid, block, smemSize>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
else
linesAccumGlobal<<<grid, block>>>(list, count, accum, 1.0f / rho, theta, accum.cols - 2);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
////////////////////////////////////////////////////////////////////////
// linesGetResult
__global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
{
const int r = blockIdx.x * blockDim.x + threadIdx.x;
const int n = blockIdx.y * blockDim.y + threadIdx.y;
if (r >= accum.cols - 2 || n >= accum.rows - 2)
return;
const int curVotes = accum(n + 1, r + 1);
if (curVotes > threshold &&
curVotes > accum(n + 1, r) &&
curVotes >= accum(n + 1, r + 2) &&
curVotes > accum(n, r + 1) &&
curVotes >= accum(n + 2, r + 1))
{
const float radius = (r - (numrho - 1) * 0.5f) * rho;
const float angle = n * theta;
const int ind = ::atomicAdd(&g_counter, 1);
if (ind < maxSize)
{
out[ind] = make_float2(radius, angle);
votes[ind] = curVotes;
}
}
}
int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
const dim3 block(32, 8);
const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(linesGetResult, cudaFuncCachePreferL1) );
linesGetResult<<<grid, block>>>(accum, out, votes, maxSize, rho, theta, threshold, accum.cols - 2);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
totalCount = ::min(totalCount, maxSize);
if (doSort && totalCount > 0)
{
thrust::device_ptr<float2> outPtr(out);
thrust::device_ptr<int> votesPtr(votes);
thrust::sort_by_key(votesPtr, votesPtr + totalCount, outPtr, thrust::greater<int>());
}
return totalCount;
}
}
}}}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,249 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
namespace cv { namespace cuda { namespace device
{
namespace hough_segments
{
__device__ int g_counter;
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_mask(false, cudaFilterModePoint, cudaAddressModeClamp);
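// Probabilistic Hough: for every accumulator peak, walk along the corresponding line
// through the edge mask (read via tex_mask), growing a segment while edge pixels are
// found and closing it once the gap exceeds lineGap; segments at least lineLength long
// are written to the output.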
__global__ void houghLinesProbabilistic(const PtrStepSzi accum,
int4* out, const int maxSize,
const float rho, const float theta,
const int lineGap, const int lineLength,
const int rows, const int cols)
{
const int r = blockIdx.x * blockDim.x + threadIdx.x;
const int n = blockIdx.y * blockDim.y + threadIdx.y;
if (r >= accum.cols - 2 || n >= accum.rows - 2)
return;
const int curVotes = accum(n + 1, r + 1);
if (curVotes >= lineLength &&
curVotes > accum(n, r) &&
curVotes > accum(n, r + 1) &&
curVotes > accum(n, r + 2) &&
curVotes > accum(n + 1, r) &&
curVotes > accum(n + 1, r + 2) &&
curVotes > accum(n + 2, r) &&
curVotes > accum(n + 2, r + 1) &&
curVotes > accum(n + 2, r + 2))
{
const float radius = (r - (accum.cols - 2 - 1) * 0.5f) * rho;
const float angle = n * theta;
float cosa;
float sina;
sincosf(angle, &sina, &cosa);
float2 p0 = make_float2(cosa * radius, sina * radius);
float2 dir = make_float2(-sina, cosa);
float2 pb[4] = {make_float2(-1, -1), make_float2(-1, -1), make_float2(-1, -1), make_float2(-1, -1)};
float a;
if (dir.x != 0)
{
a = -p0.x / dir.x;
pb[0].x = 0;
pb[0].y = p0.y + a * dir.y;
a = (cols - 1 - p0.x) / dir.x;
pb[1].x = cols - 1;
pb[1].y = p0.y + a * dir.y;
}
if (dir.y != 0)
{
a = -p0.y / dir.y;
pb[2].x = p0.x + a * dir.x;
pb[2].y = 0;
a = (rows - 1 - p0.y) / dir.y;
pb[3].x = p0.x + a * dir.x;
pb[3].y = rows - 1;
}
if (pb[0].x == 0 && (pb[0].y >= 0 && pb[0].y < rows))
{
p0 = pb[0];
if (dir.x < 0)
dir = -dir;
}
else if (pb[1].x == cols - 1 && (pb[1].y >= 0 && pb[1].y < rows))
{
p0 = pb[1];
if (dir.x > 0)
dir = -dir;
}
else if (pb[2].y == 0 && (pb[2].x >= 0 && pb[2].x < cols))
{
p0 = pb[2];
if (dir.y < 0)
dir = -dir;
}
else if (pb[3].y == rows - 1 && (pb[3].x >= 0 && pb[3].x < cols))
{
p0 = pb[3];
if (dir.y > 0)
dir = -dir;
}
float2 d;
if (::fabsf(dir.x) > ::fabsf(dir.y))
{
d.x = dir.x > 0 ? 1 : -1;
d.y = dir.y / ::fabsf(dir.x);
}
else
{
d.x = dir.x / ::fabsf(dir.y);
d.y = dir.y > 0 ? 1 : -1;
}
float2 line_end[2];
int gap;
bool inLine = false;
float2 p1 = p0;
if (p1.x < 0 || p1.x >= cols || p1.y < 0 || p1.y >= rows)
return;
for (;;)
{
if (tex2D(tex_mask, p1.x, p1.y))
{
gap = 0;
if (!inLine)
{
line_end[0] = p1;
line_end[1] = p1;
inLine = true;
}
else
{
line_end[1] = p1;
}
}
else if (inLine)
{
if (++gap > lineGap)
{
bool good_line = ::abs(line_end[1].x - line_end[0].x) >= lineLength ||
::abs(line_end[1].y - line_end[0].y) >= lineLength;
if (good_line)
{
const int ind = ::atomicAdd(&g_counter, 1);
if (ind < maxSize)
out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
}
gap = 0;
inLine = false;
}
}
p1 = p1 + d;
if (p1.x < 0 || p1.x >= cols || p1.y < 0 || p1.y >= rows)
{
if (inLine)
{
bool good_line = ::abs(line_end[1].x - line_end[0].x) >= lineLength ||
::abs(line_end[1].y - line_end[0].y) >= lineLength;
if (good_line)
{
const int ind = ::atomicAdd(&g_counter, 1);
if (ind < maxSize)
out[ind] = make_int4(line_end[0].x, line_end[0].y, line_end[1].x, line_end[1].y);
}
}
break;
}
}
}
}
int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
const dim3 block(32, 8);
const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
bindTexture(&tex_mask, mask);
houghLinesProbabilistic<<<grid, block>>>(accum,
out, maxSize,
rho, theta,
lineGap, lineLength,
mask.rows, mask.cols);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
int totalCount;
cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
totalCount = ::min(totalCount, maxSize);
return totalCount;
}
}
}}}
#endif /* CUDA_DISABLER */

View File

@@ -0,0 +1,916 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
namespace cv { namespace cuda { namespace device
{
namespace match_template
{
__device__ __forceinline__ float sum(float v) { return v; }
__device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
__device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }
__device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }
__device__ __forceinline__ float first(float v) { return v; }
__device__ __forceinline__ float first(float2 v) { return v.x; }
__device__ __forceinline__ float first(float3 v) { return v.x; }
__device__ __forceinline__ float first(float4 v) { return v.x; }
__device__ __forceinline__ float mul(float a, float b) { return a * b; }
__device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
__device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
__device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
__device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }
__device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
__device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
__device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
__device__ __forceinline__ float sub(float a, float b) { return a - b; }
__device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
__device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
__device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
__device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }
__device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
__device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
__device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
//////////////////////////////////////////////////////////////////////
// Naive_CCORR
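// Brute-force cross-correlation: each thread computes one result pixel by summing the
// element-wise product of the template and the image window under it.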
template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_CCORR(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
{
typedef typename TypeVec<T, cn>::vec_type Type;
typedef typename TypeVec<float, cn>::vec_type Typef;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
Typef res = VecTraits<Typef>::all(0);
for (int i = 0; i < h; ++i)
{
const Type* image_ptr = (const Type*)image.ptr(y + i);
const Type* templ_ptr = (const Type*)templ.ptr(i);
for (int j = 0; j < w; ++j)
res = res + mul(image_ptr[x + j], templ_ptr[j]);
}
result.ptr(y)[x] = sum(res);
}
}
template <typename T, int cn>
void matchTemplateNaive_CCORR(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplateNaiveKernel_CCORR<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_CCORR<float, 1>, matchTemplateNaive_CCORR<float, 2>, matchTemplateNaive_CCORR<float, 3>, matchTemplateNaive_CCORR<float, 4>
};
callers[cn](image, templ, result, stream);
}
void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_CCORR<uchar, 1>, matchTemplateNaive_CCORR<uchar, 2>, matchTemplateNaive_CCORR<uchar, 3>, matchTemplateNaive_CCORR<uchar, 4>
};
callers[cn](image, templ, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Naive_SQDIFF
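// Brute-force sum of squared differences, with the same thread layout as the CCORR kernel.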
template <typename T, int cn>
__global__ void matchTemplateNaiveKernel_SQDIFF(int w, int h, const PtrStepb image, const PtrStepb templ, PtrStepSzf result)
{
typedef typename TypeVec<T, cn>::vec_type Type;
typedef typename TypeVec<float, cn>::vec_type Typef;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
Typef res = VecTraits<Typef>::all(0);
Typef delta;
for (int i = 0; i < h; ++i)
{
const Type* image_ptr = (const Type*)image.ptr(y + i);
const Type* templ_ptr = (const Type*)templ.ptr(i);
for (int j = 0; j < w; ++j)
{
delta = sub(image_ptr[x + j], templ_ptr[j]);
res = res + delta * delta;
}
}
result.ptr(y)[x] = sum(res);
}
}
template <typename T, int cn>
void matchTemplateNaive_SQDIFF(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplateNaiveKernel_SQDIFF<T, cn><<<grid, threads, 0, stream>>>(templ.cols, templ.rows, image, templ, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_SQDIFF<float, 1>, matchTemplateNaive_SQDIFF<float, 2>, matchTemplateNaive_SQDIFF<float, 3>, matchTemplateNaive_SQDIFF<float, 4>
};
callers[cn](image, templ, result, stream);
}
void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplateNaive_SQDIFF<uchar, 1>, matchTemplateNaive_SQDIFF<uchar, 2>, matchTemplateNaive_SQDIFF<uchar, 3>, matchTemplateNaive_SQDIFF<uchar, 4>
};
callers[cn](image, templ, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_SQDIFF
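// "Prepared" SQDIFF uses the identity sum((I - T)^2) = sum(I^2) - 2*ccorr(I, T) + sum(T^2):
// the cross-correlation has already been written into result, the image term is a box sum
// from the squared integral image, and the template term is a precomputed constant.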
template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_8U(int w, int h, const PtrStep<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sqsum_ = (float)(
(image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
(image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = image_sqsum_ - 2.f * ccorr + templ_sqsum;
}
}
template <int cn>
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_SQDIFF_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, int cn,
cudaStream_t stream)
{
typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplatePrepared_SQDIFF_8U<1>, matchTemplatePrepared_SQDIFF_8U<2>, matchTemplatePrepared_SQDIFF_8U<3>, matchTemplatePrepared_SQDIFF_8U<4>
};
callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_SQDIFF_NORMED
// normAcc* are accurate normalization routines that keep the GPU matchTemplate
// results consistent with the CPU implementation
__device__ float normAcc(float num, float denum)
{
if (::fabs(num) < denum)
return num / denum;
if (::fabs(num) < denum * 1.125f)
return num > 0 ? 1 : -1;
return 0;
}
__device__ float normAcc_SQDIFF(float num, float denum)
{
if (::fabs(num) < denum)
return num / denum;
if (::fabs(num) < denum * 1.125f)
return num > 0 ? 1 : -1;
return 1;
}
template <int cn>
__global__ void matchTemplatePreparedKernel_SQDIFF_NORMED_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sqsum_ = (float)(
(image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
(image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = normAcc_SQDIFF(image_sqsum_ - 2.f * ccorr + templ_sqsum,
sqrtf(image_sqsum_ * templ_sqsum));
}
}
template <int cn>
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
PtrStepSzf result, cudaStream_t stream)
{
const dim3 threads(32, 8);
const dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_SQDIFF_NORMED_8U<cn><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum,
PtrStepSzf result, int cn, cudaStream_t stream)
{
typedef void (*caller_t)(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result, cudaStream_t stream);
static const caller_t callers[] =
{
0, matchTemplatePrepared_SQDIFF_NORMED_8U<1>, matchTemplatePrepared_SQDIFF_NORMED_8U<2>, matchTemplatePrepared_SQDIFF_NORMED_8U<3>, matchTemplatePrepared_SQDIFF_NORMED_8U<4>
};
callers[cn](w, h, image_sqsum, templ_sqsum, result, stream);
}
//////////////////////////////////////////////////////////////////////
// Prepared_CCOFF
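// CCOEFF subtracts the template-mean contribution from the precomputed cross-correlation:
// ccorr - sum(I_window) * (sum(T) / (w*h)), with sum(I_window) taken from an integral image.
// The multi-channel kernels repeat this per channel.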
__global__ void matchTemplatePreparedKernel_CCOFF_8U(int w, int h, float templ_sum_scale, const PtrStep<unsigned int> image_sum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_ = (float)(
(image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
(image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = ccorr - image_sum_ * templ_sum_scale;
}
}
void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads, 0, stream>>>(w, h, (float)templ_sum / (w * h), image_sum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC2(
int w, int h, float templ_sum_scale_r, float templ_sum_scale_g,
const PtrStep<unsigned int> image_sum_r,
const PtrStep<unsigned int> image_sum_g,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g;
}
}
void matchTemplatePrepared_CCOFF_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
unsigned int templ_sum_r, unsigned int templ_sum_g,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads, 0, stream>>>(
w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
image_sum_r, image_sum_g, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC3(
int w, int h,
float templ_sum_scale_r,
float templ_sum_scale_g,
float templ_sum_scale_b,
const PtrStep<unsigned int> image_sum_r,
const PtrStep<unsigned int> image_sum_g,
const PtrStep<unsigned int> image_sum_b,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sum_b_ = (float)(
(image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
(image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g
- image_sum_b_ * templ_sum_scale_b;
}
}
void matchTemplatePrepared_CCOFF_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC3<<<grid, threads, 0, stream>>>(
w, h,
(float)templ_sum_r / (w * h),
(float)templ_sum_g / (w * h),
(float)templ_sum_b / (w * h),
image_sum_r, image_sum_g, image_sum_b, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_8UC4(
int w, int h,
float templ_sum_scale_r,
float templ_sum_scale_g,
float templ_sum_scale_b,
float templ_sum_scale_a,
const PtrStep<unsigned int> image_sum_r,
const PtrStep<unsigned int> image_sum_g,
const PtrStep<unsigned int> image_sum_b,
const PtrStep<unsigned int> image_sum_a,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sum_b_ = (float)(
(image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
(image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
float image_sum_a_ = (float)(
(image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
(image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = ccorr - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g
- image_sum_b_ * templ_sum_scale_b
- image_sum_a_ * templ_sum_scale_a;
}
}
void matchTemplatePrepared_CCOFF_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
const PtrStepSz<unsigned int> image_sum_a,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
unsigned int templ_sum_a,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8UC4<<<grid, threads, 0, stream>>>(
w, h,
(float)templ_sum_r / (w * h),
(float)templ_sum_g / (w * h),
(float)templ_sum_b / (w * h),
(float)templ_sum_a / (w * h),
image_sum_r, image_sum_g, image_sum_b, image_sum_a,
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////
// Prepared_CCOFF_NORMED
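// CCOEFF_NORMED: the numerator is the same mean-corrected correlation as in the
// CCOFF kernels above; the denominator is
//   sqrt( (sum T^2 - (sum T)^2 / N) * (sum I^2 - (sum I)^2 / N) ),  N = w*h,
// where the template term (templ_sqsum_scale) is precomputed on the host and the
// window term is rebuilt per pixel from the sum / squared-sum integral images.
// normAcc() (defined earlier in this file) performs the division while guarding
// against a degenerate, near-zero denominator.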
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8U(
int w, int h, float weight,
float templ_sum_scale, float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum,
const PtrStep<unsigned long long> image_sqsum,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float ccorr = result.ptr(y)[x];
float image_sum_ = (float)(
(image_sum.ptr(y + h)[x + w] - image_sum.ptr(y)[x + w]) -
(image_sum.ptr(y + h)[x] - image_sum.ptr(y)[x]));
float image_sqsum_ = (float)(
(image_sqsum.ptr(y + h)[x + w] - image_sqsum.ptr(y)[x + w]) -
(image_sqsum.ptr(y + h)[x] - image_sqsum.ptr(y)[x]));
result.ptr(y)[x] = normAcc(ccorr - image_sum_ * templ_sum_scale,
sqrtf(templ_sqsum_scale * (image_sqsum_ - weight * image_sum_ * image_sum_)));
}
}
void matchTemplatePrepared_CCOFF_NORMED_8U(
int w, int h, const PtrStepSz<unsigned int> image_sum,
const PtrStepSz<unsigned long long> image_sqsum,
unsigned int templ_sum, unsigned long long templ_sqsum,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
float weight = 1.f / (w * h);
float templ_sum_scale = templ_sum * weight;
float templ_sqsum_scale = templ_sqsum - weight * templ_sum * templ_sum;
matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads, 0, stream>>>(
w, h, weight, templ_sum_scale, templ_sqsum_scale,
image_sum, image_sqsum, result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC2(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g,
float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sqsum_r_ = (float)(
(image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
(image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sqsum_g_ = (float)(
(image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
(image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g;
float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
+ image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_));
result.ptr(y)[x] = normAcc(num, denum);
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
float weight = 1.f / (w * h);
float templ_sum_scale_r = templ_sum_r * weight;
float templ_sum_scale_g = templ_sum_g * weight;
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC2<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g,
templ_sqsum_scale,
image_sum_r, image_sqsum_r,
image_sum_g, image_sqsum_g,
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC3(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sqsum_r_ = (float)(
(image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
(image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sqsum_g_ = (float)(
(image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
(image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
float image_sum_b_ = (float)(
(image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
(image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
float image_sqsum_b_ = (float)(
(image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
(image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r
- image_sum_g_ * templ_sum_scale_g
- image_sum_b_ * templ_sum_scale_b;
float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
+ image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
+ image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_));
result.ptr(y)[x] = normAcc(num, denum);
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
float weight = 1.f / (w * h);
float templ_sum_scale_r = templ_sum_r * weight;
float templ_sum_scale_g = templ_sum_g * weight;
float templ_sum_scale_b = templ_sum_b * weight;
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g
+ templ_sqsum_b - weight * templ_sum_b * templ_sum_b;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC3<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b,
templ_sqsum_scale,
image_sum_r, image_sqsum_r,
image_sum_g, image_sqsum_g,
image_sum_b, image_sqsum_b,
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void matchTemplatePreparedKernel_CCOFF_NORMED_8UC4(
int w, int h, float weight,
float templ_sum_scale_r, float templ_sum_scale_g, float templ_sum_scale_b,
float templ_sum_scale_a, float templ_sqsum_scale,
const PtrStep<unsigned int> image_sum_r, const PtrStep<unsigned long long> image_sqsum_r,
const PtrStep<unsigned int> image_sum_g, const PtrStep<unsigned long long> image_sqsum_g,
const PtrStep<unsigned int> image_sum_b, const PtrStep<unsigned long long> image_sqsum_b,
const PtrStep<unsigned int> image_sum_a, const PtrStep<unsigned long long> image_sqsum_a,
PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sum_r_ = (float)(
(image_sum_r.ptr(y + h)[x + w] - image_sum_r.ptr(y)[x + w]) -
(image_sum_r.ptr(y + h)[x] - image_sum_r.ptr(y)[x]));
float image_sqsum_r_ = (float)(
(image_sqsum_r.ptr(y + h)[x + w] - image_sqsum_r.ptr(y)[x + w]) -
(image_sqsum_r.ptr(y + h)[x] - image_sqsum_r.ptr(y)[x]));
float image_sum_g_ = (float)(
(image_sum_g.ptr(y + h)[x + w] - image_sum_g.ptr(y)[x + w]) -
(image_sum_g.ptr(y + h)[x] - image_sum_g.ptr(y)[x]));
float image_sqsum_g_ = (float)(
(image_sqsum_g.ptr(y + h)[x + w] - image_sqsum_g.ptr(y)[x + w]) -
(image_sqsum_g.ptr(y + h)[x] - image_sqsum_g.ptr(y)[x]));
float image_sum_b_ = (float)(
(image_sum_b.ptr(y + h)[x + w] - image_sum_b.ptr(y)[x + w]) -
(image_sum_b.ptr(y + h)[x] - image_sum_b.ptr(y)[x]));
float image_sqsum_b_ = (float)(
(image_sqsum_b.ptr(y + h)[x + w] - image_sqsum_b.ptr(y)[x + w]) -
(image_sqsum_b.ptr(y + h)[x] - image_sqsum_b.ptr(y)[x]));
float image_sum_a_ = (float)(
(image_sum_a.ptr(y + h)[x + w] - image_sum_a.ptr(y)[x + w]) -
(image_sum_a.ptr(y + h)[x] - image_sum_a.ptr(y)[x]));
float image_sqsum_a_ = (float)(
(image_sqsum_a.ptr(y + h)[x + w] - image_sqsum_a.ptr(y)[x + w]) -
(image_sqsum_a.ptr(y + h)[x] - image_sqsum_a.ptr(y)[x]));
float num = result.ptr(y)[x] - image_sum_r_ * templ_sum_scale_r - image_sum_g_ * templ_sum_scale_g
- image_sum_b_ * templ_sum_scale_b - image_sum_a_ * templ_sum_scale_a;
float denum = sqrtf(templ_sqsum_scale * (image_sqsum_r_ - weight * image_sum_r_ * image_sum_r_
+ image_sqsum_g_ - weight * image_sum_g_ * image_sum_g_
+ image_sqsum_b_ - weight * image_sum_b_ * image_sum_b_
+ image_sqsum_a_ - weight * image_sum_a_ * image_sum_a_));
result.ptr(y)[x] = normAcc(num, denum);
}
}
void matchTemplatePrepared_CCOFF_NORMED_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
PtrStepSzf result, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
float weight = 1.f / (w * h);
float templ_sum_scale_r = templ_sum_r * weight;
float templ_sum_scale_g = templ_sum_g * weight;
float templ_sum_scale_b = templ_sum_b * weight;
float templ_sum_scale_a = templ_sum_a * weight;
float templ_sqsum_scale = templ_sqsum_r - weight * templ_sum_r * templ_sum_r
+ templ_sqsum_g - weight * templ_sum_g * templ_sum_g
+ templ_sqsum_b - weight * templ_sum_b * templ_sum_b
+ templ_sqsum_a - weight * templ_sum_a * templ_sum_a;
matchTemplatePreparedKernel_CCOFF_NORMED_8UC4<<<grid, threads, 0, stream>>>(
w, h, weight,
templ_sum_scale_r, templ_sum_scale_g, templ_sum_scale_b, templ_sum_scale_a,
templ_sqsum_scale,
image_sum_r, image_sqsum_r,
image_sum_g, image_sqsum_g,
image_sum_b, image_sqsum_b,
image_sum_a, image_sqsum_a,
result);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////
// normalize
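// normalize_8U turns a raw cross-correlation map into CCORR_NORMED: each value is
// divided by sqrt(window_sqsum * templ_sqsum). image_sqsum is the integral of the
// channel-interleaved 8-bit image (width * cn columns), so sampling at column
// (x + w) * cn accumulates the squared values of every channel up to pixel x + w;
// the difference below therefore yields the window's squared sum over all channels.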
template <int cn>
__global__ void normalizeKernel_8U(
int w, int h, const PtrStep<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sqsum_ = (float)(
(image_sqsum.ptr(y + h)[(x + w) * cn] - image_sqsum.ptr(y)[(x + w) * cn]) -
(image_sqsum.ptr(y + h)[x * cn] - image_sqsum.ptr(y)[x * cn]));
result.ptr(y)[x] = normAcc(result.ptr(y)[x], sqrtf(image_sqsum_ * templ_sqsum));
}
}
void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
normalizeKernel_8U<1><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 2:
normalizeKernel_8U<2><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 3:
normalizeKernel_8U<3><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
case 4:
normalizeKernel_8U<4><<<grid, threads, 0, stream>>>(w, h, image_sqsum, templ_sqsum, result);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
//////////////////////////////////////////////////////////////////////
// extractFirstChannel
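// extractFirstChannel_32F copies channel 0 of a cn-channel 32F image into a
// single-channel map: first() (defined elsewhere in this file) selects the leading
// component of the vector type, collapsing the per-channel correlation output back
// to a single response image.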
template <int cn>
__global__ void extractFirstChannel_32F(const PtrStepb image, PtrStepSzf result)
{
typedef typename TypeVec<float, cn>::vec_type Typef;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
Typef val = ((const Typef*)image.ptr(y))[x];
result.ptr(y)[x] = first(val);
}
}
void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
switch (cn)
{
case 1:
extractFirstChannel_32F<1><<<grid, threads, 0, stream>>>(image, result);
break;
case 2:
extractFirstChannel_32F<2><<<grid, threads, 0, stream>>>(image, result);
break;
case 3:
extractFirstChannel_32F<3><<<grid, threads, 0, stream>>>(image, result);
break;
case 4:
extractFirstChannel_32F<4><<<grid, threads, 0, stream>>>(image, result);
break;
}
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
} //namespace match_template
}}} // namespace cv { namespace cuda { namespace device
#endif /* CUDA_DISABLER */


@@ -0,0 +1,182 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
texture<uchar4, 2> tex_meanshift;
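// do_mean_shift runs the mean-shift iteration for one pixel: starting from (x0, y0)
// it averages the positions and colors of all pixels inside the (2*sp+1)^2 spatial
// window whose squared color distance to the current value is at most sr*sr, moves
// to that average, and stops after maxIter iterations or once the combined spatial
// plus (squared) color shift drops below eps. The converged color is written to
// `out`; the converged position is returned for the meanShiftProc variant.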
__device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
size_t out_step, int cols, int rows,
int sp, int sr, int maxIter, float eps)
{
int isr2 = sr*sr;
uchar4 c = tex2D(tex_meanshift, x0, y0 );
// iterate meanshift procedure
for( int iter = 0; iter < maxIter; iter++ )
{
int count = 0;
int s0 = 0, s1 = 0, s2 = 0, sx = 0, sy = 0;
float icount;
//mean shift: process pixels in window (p-sigmaSp)x(p+sigmaSp)
int minx = x0-sp;
int miny = y0-sp;
int maxx = x0+sp;
int maxy = y0+sp;
for( int y = miny; y <= maxy; y++)
{
int rowCount = 0;
for( int x = minx; x <= maxx; x++ )
{
uchar4 t = tex2D( tex_meanshift, x, y );
int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);
if( norm2 <= isr2 )
{
s0 += t.x; s1 += t.y; s2 += t.z;
sx += x; rowCount++;
}
}
count += rowCount;
sy += y*rowCount;
}
if( count == 0 )
break;
icount = 1.f/count;
int x1 = __float2int_rz(sx*icount);
int y1 = __float2int_rz(sy*icount);
s0 = __float2int_rz(s0*icount);
s1 = __float2int_rz(s1*icount);
s2 = __float2int_rz(s2*icount);
int norm2 = (s0 - c.x) * (s0 - c.x) + (s1 - c.y) * (s1 - c.y) + (s2 - c.z) * (s2 - c.z);
bool stopFlag = (x0 == x1 && y0 == y1) || (::abs(x1-x0) + ::abs(y1-y0) + norm2 <= eps);
x0 = x1; y0 = y1;
c.x = s0; c.y = s1; c.z = s2;
if( stopFlag )
break;
}
int base = (blockIdx.y * blockDim.y + threadIdx.y) * out_step + (blockIdx.x * blockDim.x + threadIdx.x) * 4 * sizeof(uchar);
*(uchar4*)(out + base) = c;
return make_short2((short)x0, (short)y0);
}
__global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
{
int x0 = blockIdx.x * blockDim.x + threadIdx.x;
int y0 = blockIdx.y * blockDim.y + threadIdx.y;
if( x0 < cols && y0 < rows )
do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
}
void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
{
dim3 grid(1, 1, 1);
dim3 threads(32, 8, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
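// meanShiftProc additionally records where each pixel converged: outr receives the
// filtered color (as in meanShiftFiltering) and outsp the final (x, y) position
// packed as short2, typically consumed afterwards by mean-shift segmentation.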
__global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep,
unsigned char* outsp, size_t outspstep,
int cols, int rows,
int sp, int sr, int maxIter, float eps)
{
int x0 = blockIdx.x * blockDim.x + threadIdx.x;
int y0 = blockIdx.y * blockDim.y + threadIdx.y;
if( x0 < cols && y0 < rows )
{
int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
*(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
}
}
void meanShiftProc_gpu(const PtrStepSzb& src, PtrStepSzb dstr, PtrStepSzb dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
{
dim3 grid(1, 1, 1);
dim3 threads(32, 8, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
}
}}}
#endif


@@ -0,0 +1,274 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __cvt_color_internal_h__
#define __cvt_color_internal_h__
namespace cv { namespace cuda { namespace device
{
#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
void name(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f)
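// For example, OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb) expands to three declarations:
//   void bgr_to_rgb_8u (PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//   void bgr_to_rgb_16u(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//   void bgr_to_rgb_32f(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
// The _8U32F and _8U32F_FULL variants declare only the 8u/32f (plus *_full_*) versions
// used below for the Lab/Luv and HSV/HLS conversions.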
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hsv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgb_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(rgba_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgr_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(bgra_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL(hls4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_lab4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab_to_lbgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lab4_to_lbgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgb_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lrgba_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgr_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(lbgra_to_luv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lrgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv_to_lbgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(luv4_to_lbgra)
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F_FULL
}}}
#endif


@@ -0,0 +1,906 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) || !defined(HAVE_OPENCV_CUDAARITHM)
Ptr<GeneralizedHoughBallard> cv::cuda::createGeneralizedHoughBallard() { throw_no_cuda(); return Ptr<GeneralizedHoughBallard>(); }
Ptr<GeneralizedHoughGuil> cv::cuda::createGeneralizedHoughGuil() { throw_no_cuda(); return Ptr<GeneralizedHoughGuil>(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace cuda { namespace device
{
namespace ght
{
template <typename T>
int buildEdgePointList_gpu(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
PtrStepSz<short2> r_table, int* r_sizes,
short2 templCenter, int levels);
void Ballard_Pos_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
PtrStepSz<short2> r_table, const int* r_sizes,
PtrStepSzi hist,
float dp, int levels);
int Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int maxSize, float dp, int threshold);
void Guil_Full_setTemplFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2);
void Guil_Full_setImageFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2);
void Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
int* sizes, int maxSize,
float xi, float angleEpsilon, int levels,
float2 center, float maxDist);
void Guil_Full_buildImageFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
int* sizes, int maxSize,
float xi, float angleEpsilon, int levels,
float2 center, float maxDist);
void Guil_Full_calcOHist_gpu(const int* templSizes, const int* imageSizes, int* OHist,
float minAngle, float maxAngle, float angleStep, int angleRange,
int levels, int tMaxSize);
void Guil_Full_calcSHist_gpu(const int* templSizes, const int* imageSizes, int* SHist,
float angle, float angleEpsilon,
float minScale, float maxScale, float iScaleStep, int scaleRange,
int levels, int tMaxSize);
void Guil_Full_calcPHist_gpu(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
float angle, float angleEpsilon, float scale,
float dp,
int levels, int tMaxSize);
int Guil_Full_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int curSize, int maxSize,
float angle, int angleVotes, float scale, int scaleVotes,
float dp, int threshold);
}
}}}
// common
namespace
{
class GeneralizedHoughBase
{
protected:
GeneralizedHoughBase();
virtual ~GeneralizedHoughBase() {}
void setTemplateImpl(InputArray templ, Point templCenter);
void setTemplateImpl(InputArray edges, InputArray dx, InputArray dy, Point templCenter);
void detectImpl(InputArray image, OutputArray positions, OutputArray votes);
void detectImpl(InputArray edges, InputArray dx, InputArray dy, OutputArray positions, OutputArray votes);
void buildEdgePointList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy);
virtual void processTempl() = 0;
virtual void processImage() = 0;
int cannyLowThresh_;
int cannyHighThresh_;
double minDist_;
double dp_;
int maxBufferSize_;
Size templSize_;
Point templCenter_;
GpuMat templEdges_;
GpuMat templDx_;
GpuMat templDy_;
Size imageSize_;
GpuMat imageEdges_;
GpuMat imageDx_;
GpuMat imageDy_;
GpuMat edgePointList_;
GpuMat outBuf_;
int posCount_;
private:
#ifdef HAVE_OPENCV_CUDAFILTERS
void calcEdges(InputArray src, GpuMat& edges, GpuMat& dx, GpuMat& dy);
#endif
void filterMinDist();
void convertTo(OutputArray positions, OutputArray votes);
#ifdef HAVE_OPENCV_CUDAFILTERS
Ptr<cuda::CannyEdgeDetector> canny_;
Ptr<cuda::Filter> filterDx_;
Ptr<cuda::Filter> filterDy_;
#endif
std::vector<float4> oldPosBuf_;
std::vector<int3> oldVoteBuf_;
std::vector<float4> newPosBuf_;
std::vector<int3> newVoteBuf_;
std::vector<int> indexies_;
};
GeneralizedHoughBase::GeneralizedHoughBase()
{
cannyLowThresh_ = 50;
cannyHighThresh_ = 100;
minDist_ = 1.0;
dp_ = 1.0;
maxBufferSize_ = 10000;
#ifdef HAVE_OPENCV_CUDAFILTERS
canny_ = cuda::createCannyEdgeDetector(cannyLowThresh_, cannyHighThresh_);
filterDx_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 1, 0);
filterDy_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 0, 1);
#endif
}
#ifdef HAVE_OPENCV_CUDAFILTERS
void GeneralizedHoughBase::calcEdges(InputArray _src, GpuMat& edges, GpuMat& dx, GpuMat& dy)
{
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC1 );
CV_Assert( cannyLowThresh_ > 0 && cannyLowThresh_ < cannyHighThresh_ );
ensureSizeIsEnough(src.size(), CV_32SC1, dx);
ensureSizeIsEnough(src.size(), CV_32SC1, dy);
filterDx_->apply(src, dx);
filterDy_->apply(src, dy);
ensureSizeIsEnough(src.size(), CV_8UC1, edges);
canny_->setLowThreshold(cannyLowThresh_);
canny_->setHighThreshold(cannyHighThresh_);
canny_->detect(dx, dy, edges);
}
#endif
void GeneralizedHoughBase::setTemplateImpl(InputArray templ, Point templCenter)
{
#ifndef HAVE_OPENCV_CUDAFILTERS
(void) templ;
(void) templCenter;
throw_no_cuda();
#else
calcEdges(templ, templEdges_, templDx_, templDy_);
if (templCenter == Point(-1, -1))
templCenter = Point(templEdges_.cols / 2, templEdges_.rows / 2);
templSize_ = templEdges_.size();
templCenter_ = templCenter;
processTempl();
#endif
}
void GeneralizedHoughBase::setTemplateImpl(InputArray edges, InputArray dx, InputArray dy, Point templCenter)
{
edges.getGpuMat().copyTo(templEdges_);
dx.getGpuMat().copyTo(templDx_);
dy.getGpuMat().copyTo(templDy_);
CV_Assert( templEdges_.type() == CV_8UC1 );
CV_Assert( templDx_.type() == CV_32FC1 && templDx_.size() == templEdges_.size() );
CV_Assert( templDy_.type() == templDx_.type() && templDy_.size() == templEdges_.size() );
if (templCenter == Point(-1, -1))
templCenter = Point(templEdges_.cols / 2, templEdges_.rows / 2);
templSize_ = templEdges_.size();
templCenter_ = templCenter;
processTempl();
}
void GeneralizedHoughBase::detectImpl(InputArray image, OutputArray positions, OutputArray votes)
{
#ifndef HAVE_OPENCV_CUDAFILTERS
(void) image;
(void) positions;
(void) votes;
throw_no_cuda();
#else
calcEdges(image, imageEdges_, imageDx_, imageDy_);
imageSize_ = imageEdges_.size();
posCount_ = 0;
processImage();
if (posCount_ == 0)
{
positions.release();
if (votes.needed())
votes.release();
}
else
{
if (minDist_ > 1)
filterMinDist();
convertTo(positions, votes);
}
#endif
}
void GeneralizedHoughBase::detectImpl(InputArray edges, InputArray dx, InputArray dy, OutputArray positions, OutputArray votes)
{
edges.getGpuMat().copyTo(imageEdges_);
dx.getGpuMat().copyTo(imageDx_);
dy.getGpuMat().copyTo(imageDy_);
CV_Assert( imageEdges_.type() == CV_8UC1 );
CV_Assert( imageDx_.type() == CV_32FC1 && imageDx_.size() == imageEdges_.size() );
CV_Assert( imageDy_.type() == imageDx_.type() && imageDy_.size() == imageEdges_.size() );
imageSize_ = imageEdges_.size();
posCount_ = 0;
processImage();
if (posCount_ == 0)
{
positions.release();
if (votes.needed())
votes.release();
}
else
{
if (minDist_ > 1)
filterMinDist();
convertTo(positions, votes);
}
}
void GeneralizedHoughBase::buildEdgePointList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy)
{
using namespace cv::cuda::device::ght;
typedef int (*func_t)(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
static const func_t funcs[] =
{
0,
0,
0,
buildEdgePointList_gpu<short>,
buildEdgePointList_gpu<int>,
buildEdgePointList_gpu<float>,
0
};
CV_Assert( edges.type() == CV_8UC1 );
CV_Assert( dx.size() == edges.size() );
CV_Assert( dy.type() == dx.type() && dy.size() == edges.size() );
const func_t func = funcs[dx.depth()];
CV_Assert( func != 0 );
edgePointList_.cols = (int) (edgePointList_.step / sizeof(int));
ensureSizeIsEnough(2, edges.size().area(), CV_32SC1, edgePointList_);
edgePointList_.cols = func(edges, dx, dy, edgePointList_.ptr<unsigned int>(0), edgePointList_.ptr<float>(1));
}
struct IndexCmp
{
const int3* aux;
explicit IndexCmp(const int3* _aux) : aux(_aux) {}
bool operator ()(int l1, int l2) const
{
return aux[l1].x > aux[l2].x;
}
};
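// filterMinDist is a greedy non-maximum suppression on the host: candidates are
// downloaded, sorted by vote count (IndexCmp orders indices by descending votes.x),
// and accepted one by one only if no already-accepted candidate lies closer than
// minDist_. The coarse grid with cellSize = round(minDist_) limits the distance
// checks to the 3x3 neighbourhood of cells around each candidate.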
void GeneralizedHoughBase::filterMinDist()
{
oldPosBuf_.resize(posCount_);
oldVoteBuf_.resize(posCount_);
cudaSafeCall( cudaMemcpy(&oldPosBuf_[0], outBuf_.ptr(0), posCount_ * sizeof(float4), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&oldVoteBuf_[0], outBuf_.ptr(1), posCount_ * sizeof(int3), cudaMemcpyDeviceToHost) );
indexies_.resize(posCount_);
for (int i = 0; i < posCount_; ++i)
indexies_[i] = i;
std::sort(indexies_.begin(), indexies_.end(), IndexCmp(&oldVoteBuf_[0]));
newPosBuf_.clear();
newVoteBuf_.clear();
newPosBuf_.reserve(posCount_);
newVoteBuf_.reserve(posCount_);
const int cellSize = cvRound(minDist_);
const int gridWidth = (imageSize_.width + cellSize - 1) / cellSize;
const int gridHeight = (imageSize_.height + cellSize - 1) / cellSize;
std::vector< std::vector<Point2f> > grid(gridWidth * gridHeight);
const double minDist2 = minDist_ * minDist_;
for (int i = 0; i < posCount_; ++i)
{
const int ind = indexies_[i];
Point2f p(oldPosBuf_[ind].x, oldPosBuf_[ind].y);
bool good = true;
const int xCell = static_cast<int>(p.x / cellSize);
const int yCell = static_cast<int>(p.y / cellSize);
int x1 = xCell - 1;
int y1 = yCell - 1;
int x2 = xCell + 1;
int y2 = yCell + 1;
// boundary check
x1 = std::max(0, x1);
y1 = std::max(0, y1);
x2 = std::min(gridWidth - 1, x2);
y2 = std::min(gridHeight - 1, y2);
for (int yy = y1; yy <= y2; ++yy)
{
for (int xx = x1; xx <= x2; ++xx)
{
const std::vector<Point2f>& m = grid[yy * gridWidth + xx];
for(size_t j = 0; j < m.size(); ++j)
{
const Point2f d = p - m[j];
if (d.ddot(d) < minDist2)
{
good = false;
goto break_out;
}
}
}
}
break_out:
if(good)
{
grid[yCell * gridWidth + xCell].push_back(p);
newPosBuf_.push_back(oldPosBuf_[ind]);
newVoteBuf_.push_back(oldVoteBuf_[ind]);
}
}
posCount_ = static_cast<int>(newPosBuf_.size());
cudaSafeCall( cudaMemcpy(outBuf_.ptr(0), &newPosBuf_[0], posCount_ * sizeof(float4), cudaMemcpyHostToDevice) );
cudaSafeCall( cudaMemcpy(outBuf_.ptr(1), &newVoteBuf_[0], posCount_ * sizeof(int3), cudaMemcpyHostToDevice) );
}
void GeneralizedHoughBase::convertTo(OutputArray positions, OutputArray votes)
{
ensureSizeIsEnough(1, posCount_, CV_32FC4, positions);
GpuMat(1, posCount_, CV_32FC4, outBuf_.ptr(0), outBuf_.step).copyTo(positions);
if (votes.needed())
{
ensureSizeIsEnough(1, posCount_, CV_32SC3, votes);
GpuMat(1, posCount_, CV_32SC3, outBuf_.ptr(1), outBuf_.step).copyTo(votes);
}
}
}
// GeneralizedHoughBallard
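// Ballard voting scheme: processTempl() builds an R-table with levels_ + 1 rows, one
// per quantized gradient orientation, each row holding the displacements (short2)
// from template edge points to the template centre. At detection time
// Ballard_Pos_calcHist_gpu lets every image edge point add those displacements,
// scaled by 1/dp_, into a 2D accumulator, and findPosInHist() keeps every cell whose
// vote count reaches votesThreshold_.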
namespace
{
class GeneralizedHoughBallardImpl : public GeneralizedHoughBallard, private GeneralizedHoughBase
{
public:
GeneralizedHoughBallardImpl();
void setTemplate(InputArray templ, Point templCenter) { setTemplateImpl(templ, templCenter); }
void setTemplate(InputArray edges, InputArray dx, InputArray dy, Point templCenter) { setTemplateImpl(edges, dx, dy, templCenter); }
void detect(InputArray image, OutputArray positions, OutputArray votes) { detectImpl(image, positions, votes); }
void detect(InputArray edges, InputArray dx, InputArray dy, OutputArray positions, OutputArray votes) { detectImpl(edges, dx, dy, positions, votes); }
void setCannyLowThresh(int cannyLowThresh) { cannyLowThresh_ = cannyLowThresh; }
int getCannyLowThresh() const { return cannyLowThresh_; }
void setCannyHighThresh(int cannyHighThresh) { cannyHighThresh_ = cannyHighThresh; }
int getCannyHighThresh() const { return cannyHighThresh_; }
void setMinDist(double minDist) { minDist_ = minDist; }
double getMinDist() const { return minDist_; }
void setDp(double dp) { dp_ = dp; }
double getDp() const { return dp_; }
void setMaxBufferSize(int maxBufferSize) { maxBufferSize_ = maxBufferSize; }
int getMaxBufferSize() const { return maxBufferSize_; }
void setLevels(int levels) { levels_ = levels; }
int getLevels() const { return levels_; }
void setVotesThreshold(int votesThreshold) { votesThreshold_ = votesThreshold; }
int getVotesThreshold() const { return votesThreshold_; }
private:
void processTempl();
void processImage();
void calcHist();
void findPosInHist();
int levels_;
int votesThreshold_;
GpuMat r_table_;
GpuMat r_sizes_;
GpuMat hist_;
};
GeneralizedHoughBallardImpl::GeneralizedHoughBallardImpl()
{
levels_ = 360;
votesThreshold_ = 100;
}
void GeneralizedHoughBallardImpl::processTempl()
{
using namespace cv::cuda::device::ght;
CV_Assert( levels_ > 0 );
buildEdgePointList(templEdges_, templDx_, templDy_);
ensureSizeIsEnough(levels_ + 1, maxBufferSize_, CV_16SC2, r_table_);
ensureSizeIsEnough(1, levels_ + 1, CV_32SC1, r_sizes_);
r_sizes_.setTo(Scalar::all(0));
if (edgePointList_.cols > 0)
{
buildRTable_gpu(edgePointList_.ptr<unsigned int>(0), edgePointList_.ptr<float>(1), edgePointList_.cols,
r_table_, r_sizes_.ptr<int>(), make_short2(templCenter_.x, templCenter_.y), levels_);
cuda::min(r_sizes_, maxBufferSize_, r_sizes_);
}
}
void GeneralizedHoughBallardImpl::processImage()
{
calcHist();
findPosInHist();
}
void GeneralizedHoughBallardImpl::calcHist()
{
using namespace cv::cuda::device::ght;
CV_Assert( levels_ > 0 && r_table_.rows == (levels_ + 1) && r_sizes_.cols == (levels_ + 1) );
CV_Assert( dp_ > 0.0);
const double idp = 1.0 / dp_;
buildEdgePointList(imageEdges_, imageDx_, imageDy_);
ensureSizeIsEnough(cvCeil(imageSize_.height * idp) + 2, cvCeil(imageSize_.width * idp) + 2, CV_32SC1, hist_);
hist_.setTo(Scalar::all(0));
if (edgePointList_.cols > 0)
{
Ballard_Pos_calcHist_gpu(edgePointList_.ptr<unsigned int>(0), edgePointList_.ptr<float>(1), edgePointList_.cols,
r_table_, r_sizes_.ptr<int>(),
hist_,
(float)dp_, levels_);
}
}
void GeneralizedHoughBallardImpl::findPosInHist()
{
using namespace cv::cuda::device::ght;
CV_Assert( votesThreshold_ > 0 );
ensureSizeIsEnough(2, maxBufferSize_, CV_32FC4, outBuf_);
posCount_ = Ballard_Pos_findPosInHist_gpu(hist_, outBuf_.ptr<float4>(0), outBuf_.ptr<int3>(1), maxBufferSize_, (float)dp_, votesThreshold_);
}
}
Ptr<GeneralizedHoughBallard> cv::cuda::createGeneralizedHoughBallard()
{
return new GeneralizedHoughBallardImpl;
}
// GeneralizedHoughGuil
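// Guil's variant decouples the unknowns: processImage() first histograms candidate
// rotations (calcOrientation), then for each accepted angle histograms candidate
// scales (calcScale), and finally votes for positions at each accepted
// (angle, scale) pair (calcPosition). In Guil's formulation the features are pairs
// of edge points whose gradient directions differ by roughly xi_ (90 degrees by
// default), matched between template and image within angleEpsilon_; angleThresh_,
// scaleThresh_ and posThresh_ gate the three voting stages.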
namespace
{
class GeneralizedHoughGuilImpl : public GeneralizedHoughGuil, private GeneralizedHoughBase
{
public:
GeneralizedHoughGuilImpl();
void setTemplate(InputArray templ, Point templCenter) { setTemplateImpl(templ, templCenter); }
void setTemplate(InputArray edges, InputArray dx, InputArray dy, Point templCenter) { setTemplateImpl(edges, dx, dy, templCenter); }
void detect(InputArray image, OutputArray positions, OutputArray votes) { detectImpl(image, positions, votes); }
void detect(InputArray edges, InputArray dx, InputArray dy, OutputArray positions, OutputArray votes) { detectImpl(edges, dx, dy, positions, votes); }
void setCannyLowThresh(int cannyLowThresh) { cannyLowThresh_ = cannyLowThresh; }
int getCannyLowThresh() const { return cannyLowThresh_; }
void setCannyHighThresh(int cannyHighThresh) { cannyHighThresh_ = cannyHighThresh; }
int getCannyHighThresh() const { return cannyHighThresh_; }
void setMinDist(double minDist) { minDist_ = minDist; }
double getMinDist() const { return minDist_; }
void setDp(double dp) { dp_ = dp; }
double getDp() const { return dp_; }
void setMaxBufferSize(int maxBufferSize) { maxBufferSize_ = maxBufferSize; }
int getMaxBufferSize() const { return maxBufferSize_; }
void setXi(double xi) { xi_ = xi; }
double getXi() const { return xi_; }
void setLevels(int levels) { levels_ = levels; }
int getLevels() const { return levels_; }
void setAngleEpsilon(double angleEpsilon) { angleEpsilon_ = angleEpsilon; }
double getAngleEpsilon() const { return angleEpsilon_; }
void setMinAngle(double minAngle) { minAngle_ = minAngle; }
double getMinAngle() const { return minAngle_; }
void setMaxAngle(double maxAngle) { maxAngle_ = maxAngle; }
double getMaxAngle() const { return maxAngle_; }
void setAngleStep(double angleStep) { angleStep_ = angleStep; }
double getAngleStep() const { return angleStep_; }
void setAngleThresh(int angleThresh) { angleThresh_ = angleThresh; }
int getAngleThresh() const { return angleThresh_; }
void setMinScale(double minScale) { minScale_ = minScale; }
double getMinScale() const { return minScale_; }
void setMaxScale(double maxScale) { maxScale_ = maxScale; }
double getMaxScale() const { return maxScale_; }
void setScaleStep(double scaleStep) { scaleStep_ = scaleStep; }
double getScaleStep() const { return scaleStep_; }
void setScaleThresh(int scaleThresh) { scaleThresh_ = scaleThresh; }
int getScaleThresh() const { return scaleThresh_; }
void setPosThresh(int posThresh) { posThresh_ = posThresh; }
int getPosThresh() const { return posThresh_; }
private:
void processTempl();
void processImage();
double xi_;
int levels_;
double angleEpsilon_;
double minAngle_;
double maxAngle_;
double angleStep_;
int angleThresh_;
double minScale_;
double maxScale_;
double scaleStep_;
int scaleThresh_;
int posThresh_;
struct Feature
{
GpuMat p1_pos;
GpuMat p1_theta;
GpuMat p2_pos;
GpuMat d12;
GpuMat r1;
GpuMat r2;
GpuMat sizes;
int maxSize;
void create(int levels, int maxCapacity, bool isTempl);
};
typedef void (*set_func_t)(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2);
typedef void (*build_func_t)(const unsigned int* coordList, const float* thetaList, int pointsCount,
int* sizes, int maxSize,
float xi, float angleEpsilon, int levels,
float2 center, float maxDist);
void buildFeatureList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Feature& features,
set_func_t set_func, build_func_t build_func, bool isTempl, Point2d center = Point2d());
void calcOrientation();
void calcScale(double angle);
void calcPosition(double angle, int angleVotes, double scale, int scaleVotes);
Feature templFeatures_;
Feature imageFeatures_;
std::vector< std::pair<double, int> > angles_;
std::vector< std::pair<double, int> > scales_;
GpuMat hist_;
std::vector<int> h_buf_;
};
double toRad(double a)
{
return a * CV_PI / 180.0;
}
double clampAngle(double a)
{
double res = a;
while (res > 360.0)
res -= 360.0;
while (res < 0)
res += 360.0;
return res;
}
bool angleEq(double a, double b, double eps = 1.0)
{
return (fabs(clampAngle(a - b)) <= eps);
}
GeneralizedHoughGuilImpl::GeneralizedHoughGuilImpl()
{
maxBufferSize_ = 1000;
xi_ = 90.0;
levels_ = 360;
angleEpsilon_ = 1.0;
minAngle_ = 0.0;
maxAngle_ = 360.0;
angleStep_ = 1.0;
angleThresh_ = 15000;
minScale_ = 0.5;
maxScale_ = 2.0;
scaleStep_ = 0.05;
scaleThresh_ = 1000;
posThresh_ = 100;
}
void GeneralizedHoughGuilImpl::processTempl()
{
using namespace cv::cuda::device::ght;
buildFeatureList(templEdges_, templDx_, templDy_, templFeatures_,
Guil_Full_setTemplFeatures, Guil_Full_buildTemplFeatureList_gpu,
true, templCenter_);
h_buf_.resize(templFeatures_.sizes.cols);
cudaSafeCall( cudaMemcpy(&h_buf_[0], templFeatures_.sizes.data, h_buf_.size() * sizeof(int), cudaMemcpyDeviceToHost) );
templFeatures_.maxSize = *std::max_element(h_buf_.begin(), h_buf_.end());
}
void GeneralizedHoughGuilImpl::processImage()
{
using namespace cv::cuda::device::ght;
CV_Assert( levels_ > 0 );
CV_Assert( templFeatures_.sizes.cols == levels_ + 1 );
CV_Assert( minAngle_ >= 0.0 && minAngle_ < maxAngle_ && maxAngle_ <= 360.0 );
CV_Assert( angleStep_ > 0.0 && angleStep_ < 360.0 );
CV_Assert( angleThresh_ > 0 );
CV_Assert( minScale_ > 0.0 && minScale_ < maxScale_ );
CV_Assert( scaleStep_ > 0.0 );
CV_Assert( scaleThresh_ > 0 );
CV_Assert( dp_ > 0.0 );
CV_Assert( posThresh_ > 0 );
const double iAngleStep = 1.0 / angleStep_;
const int angleRange = cvCeil((maxAngle_ - minAngle_) * iAngleStep);
const double iScaleStep = 1.0 / scaleStep_;
const int scaleRange = cvCeil((maxScale_ - minScale_) * iScaleStep);
const double idp = 1.0 / dp_;
const int histRows = cvCeil(imageSize_.height * idp);
const int histCols = cvCeil(imageSize_.width * idp);
// hist_ is reused for the orientation, scale and position histograms,
// so it must be large enough to hold the largest of the three.
ensureSizeIsEnough(histRows + 2, std::max(angleRange + 1, std::max(scaleRange + 1, histCols + 2)), CV_32SC1, hist_);
h_buf_.resize(std::max(angleRange + 1, scaleRange + 1));
ensureSizeIsEnough(2, maxBufferSize_, CV_32FC4, outBuf_);
buildFeatureList(imageEdges_, imageDx_, imageDy_, imageFeatures_,
Guil_Full_setImageFeatures, Guil_Full_buildImageFeatureList_gpu,
false);
calcOrientation();
for (size_t i = 0; i < angles_.size(); ++i)
{
const double angle = angles_[i].first;
const int angleVotes = angles_[i].second;
calcScale(angle);
for (size_t j = 0; j < scales_.size(); ++j)
{
const double scale = scales_[j].first;
const int scaleVotes = scales_[j].second;
calcPosition(angle, angleVotes, scale, scaleVotes);
}
}
}
void GeneralizedHoughGuilImpl::Feature::create(int levels, int maxCapacity, bool isTempl)
{
if (!isTempl)
{
ensureSizeIsEnough(levels + 1, maxCapacity, CV_32FC2, p1_pos);
ensureSizeIsEnough(levels + 1, maxCapacity, CV_32FC2, p2_pos);
}
ensureSizeIsEnough(levels + 1, maxCapacity, CV_32FC1, p1_theta);
ensureSizeIsEnough(levels + 1, maxCapacity, CV_32FC1, d12);
if (isTempl)
{
ensureSizeIsEnough(levels + 1, maxCapacity, CV_32FC2, r1);
ensureSizeIsEnough(levels + 1, maxCapacity, CV_32FC2, r2);
}
ensureSizeIsEnough(1, levels + 1, CV_32SC1, sizes);
sizes.setTo(Scalar::all(0));
maxSize = 0;
}
void GeneralizedHoughGuilImpl::buildFeatureList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Feature& features,
set_func_t set_func, build_func_t build_func, bool isTempl, Point2d center)
{
CV_Assert( levels_ > 0 );
const double maxDist = sqrt((double) templSize_.width * templSize_.width + templSize_.height * templSize_.height) * maxScale_;
features.create(levels_, maxBufferSize_, isTempl);
set_func(features.p1_pos, features.p1_theta, features.p2_pos, features.d12, features.r1, features.r2);
buildEdgePointList(edges, dx, dy);
if (edgePointList_.cols > 0)
{
build_func(edgePointList_.ptr<unsigned int>(0), edgePointList_.ptr<float>(1), edgePointList_.cols,
features.sizes.ptr<int>(), maxBufferSize_, (float)xi_, (float)angleEpsilon_, levels_, make_float2((float)center.x, (float)center.y), (float)maxDist);
}
}
void GeneralizedHoughGuilImpl::calcOrientation()
{
using namespace cv::cuda::device::ght;
const double iAngleStep = 1.0 / angleStep_;
const int angleRange = cvCeil((maxAngle_ - minAngle_) * iAngleStep);
hist_.setTo(Scalar::all(0));
Guil_Full_calcOHist_gpu(templFeatures_.sizes.ptr<int>(), imageFeatures_.sizes.ptr<int>(0), hist_.ptr<int>(),
(float)minAngle_, (float)maxAngle_, (float)angleStep_, angleRange, levels_, templFeatures_.maxSize);
cudaSafeCall( cudaMemcpy(&h_buf_[0], hist_.data, h_buf_.size() * sizeof(int), cudaMemcpyDeviceToHost) );
angles_.clear();
for (int n = 0; n < angleRange; ++n)
{
if (h_buf_[n] >= angleThresh_)
{
const double angle = minAngle_ + n * angleStep_;
angles_.push_back(std::make_pair(angle, h_buf_[n]));
}
}
}
void GeneralizedHoughGuilImpl::calcScale(double angle)
{
using namespace cv::cuda::device::ght;
const double iScaleStep = 1.0 / scaleStep_;
const int scaleRange = cvCeil((maxScale_ - minScale_) * iScaleStep);
hist_.setTo(Scalar::all(0));
Guil_Full_calcSHist_gpu(templFeatures_.sizes.ptr<int>(), imageFeatures_.sizes.ptr<int>(0), hist_.ptr<int>(),
(float)angle, (float)angleEpsilon_, (float)minScale_, (float)maxScale_,
(float)iScaleStep, scaleRange, levels_, templFeatures_.maxSize);
cudaSafeCall( cudaMemcpy(&h_buf_[0], hist_.data, h_buf_.size() * sizeof(int), cudaMemcpyDeviceToHost) );
scales_.clear();
for (int s = 0; s < scaleRange; ++s)
{
if (h_buf_[s] >= scaleThresh_)
{
const double scale = minScale_ + s * scaleStep_;
scales_.push_back(std::make_pair(scale, h_buf_[s]));
}
}
}
void GeneralizedHoughGuilImpl::calcPosition(double angle, int angleVotes, double scale, int scaleVotes)
{
using namespace cv::cuda::device::ght;
hist_.setTo(Scalar::all(0));
Guil_Full_calcPHist_gpu(templFeatures_.sizes.ptr<int>(), imageFeatures_.sizes.ptr<int>(0), hist_,
(float)angle, (float)angleEpsilon_, (float)scale, (float)dp_, levels_, templFeatures_.maxSize);
posCount_ = Guil_Full_findPosInHist_gpu(hist_, outBuf_.ptr<float4>(0), outBuf_.ptr<int3>(1),
posCount_, maxBufferSize_, (float)angle, angleVotes,
(float)scale, scaleVotes, (float)dp_, posThresh_);
}
}
Ptr<GeneralizedHoughGuil> cv::cuda::createGeneralizedHoughGuil()
{
return new GeneralizedHoughGuilImpl;
}
#endif /* !defined (HAVE_CUDA) */
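A minimal usage sketch of the factory defined above (illustrative only, not part of this commit; it assumes the cv::GeneralizedHoughGuil interface from the imgproc module, with setTemplate()/detect() accepting GpuMat inputs in the CUDA implementation):
// Illustrative sketch, not part of the patch.
cv::Ptr<cv::GeneralizedHoughGuil> guil = cv::cuda::createGeneralizedHoughGuil();
guil->setMinScale(0.8);
guil->setMaxScale(1.25);
guil->setScaleStep(0.05);
guil->setTemplate(d_templ);                 // d_templ: CV_8UC1 GpuMat holding the template image
cv::cuda::GpuMat d_positions;
guil->detect(d_image, d_positions);         // d_image: CV_8UC1 GpuMat; each result is (x, y, scale, angle)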

View File

@@ -0,0 +1,215 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) || !defined(HAVE_OPENCV_CUDAARITHM)
Ptr<cuda::CornersDetector> cv::cuda::createGoodFeaturesToTrackDetector(int, int, double, double, int, bool, double) { throw_no_cuda(); return Ptr<cuda::CornersDetector>(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace cuda { namespace device
{
namespace gfft
{
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count);
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count);
}
}}}
namespace
{
class GoodFeaturesToTrackDetector : public CornersDetector
{
public:
GoodFeaturesToTrackDetector(int srcType, int maxCorners, double qualityLevel, double minDistance,
int blockSize, bool useHarrisDetector, double harrisK);
void detect(InputArray image, OutputArray corners, InputArray mask = noArray());
private:
int maxCorners_;
double qualityLevel_;
double minDistance_;
Ptr<cuda::CornernessCriteria> cornerCriteria_;
GpuMat Dx_;
GpuMat Dy_;
GpuMat buf_;
GpuMat eig_;
GpuMat minMaxbuf_;
GpuMat tmpCorners_;
};
GoodFeaturesToTrackDetector::GoodFeaturesToTrackDetector(int srcType, int maxCorners, double qualityLevel, double minDistance,
int blockSize, bool useHarrisDetector, double harrisK) :
maxCorners_(maxCorners), qualityLevel_(qualityLevel), minDistance_(minDistance)
{
CV_Assert( qualityLevel_ > 0 && minDistance_ >= 0 && maxCorners_ >= 0 );
cornerCriteria_ = useHarrisDetector ?
cuda::createHarrisCorner(srcType, blockSize, 3, harrisK) :
cuda::createMinEigenValCorner(srcType, blockSize, 3);
}
void GoodFeaturesToTrackDetector::detect(InputArray _image, OutputArray _corners, InputArray _mask)
{
using namespace cv::cuda::device::gfft;
GpuMat image = _image.getGpuMat();
GpuMat mask = _mask.getGpuMat();
CV_Assert( mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()) );
ensureSizeIsEnough(image.size(), CV_32FC1, eig_);
cornerCriteria_->compute(image, eig_);
double maxVal = 0;
cuda::minMax(eig_, 0, &maxVal, noArray(), minMaxbuf_);
ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols);
if (total == 0)
{
_corners.release();
return;
}
sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total);
if (minDistance_ < 1)
{
tmpCorners_.colRange(0, maxCorners_ > 0 ? std::min(maxCorners_, total) : total).copyTo(_corners);
}
else
{
std::vector<Point2f> tmp(total);
Mat tmpMat(1, total, CV_32FC2, (void*)&tmp[0]);
tmpCorners_.colRange(0, total).download(tmpMat);
std::vector<Point2f> tmp2;
tmp2.reserve(total);
// Enforce the minimum-distance constraint on the host: accepted corners are
// binned into a uniform grid, and a candidate is kept only if no previously
// kept corner in the neighbouring cells lies closer than minDistance_.
const int cell_size = cvRound(minDistance_);
const int grid_width = (image.cols + cell_size - 1) / cell_size;
const int grid_height = (image.rows + cell_size - 1) / cell_size;
std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
for (int i = 0; i < total; ++i)
{
Point2f p = tmp[i];
bool good = true;
int x_cell = static_cast<int>(p.x / cell_size);
int y_cell = static_cast<int>(p.y / cell_size);
int x1 = x_cell - 1;
int y1 = y_cell - 1;
int x2 = x_cell + 1;
int y2 = y_cell + 1;
// boundary check
x1 = std::max(0, x1);
y1 = std::max(0, y1);
x2 = std::min(grid_width - 1, x2);
y2 = std::min(grid_height - 1, y2);
for (int yy = y1; yy <= y2; yy++)
{
for (int xx = x1; xx <= x2; xx++)
{
std::vector<Point2f>& m = grid[yy * grid_width + xx];
if (!m.empty())
{
for(size_t j = 0; j < m.size(); j++)
{
float dx = p.x - m[j].x;
float dy = p.y - m[j].y;
if (dx * dx + dy * dy < minDistance_ * minDistance_)
{
good = false;
goto break_out;
}
}
}
}
}
break_out:
if(good)
{
grid[y_cell * grid_width + x_cell].push_back(p);
tmp2.push_back(p);
if (maxCorners_ > 0 && tmp2.size() == static_cast<size_t>(maxCorners_))
break;
}
}
_corners.create(1, static_cast<int>(tmp2.size()), CV_32FC2);
GpuMat corners = _corners.getGpuMat();
corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
}
}
}
Ptr<cuda::CornersDetector> cv::cuda::createGoodFeaturesToTrackDetector(int srcType, int maxCorners, double qualityLevel, double minDistance,
int blockSize, bool useHarrisDetector, double harrisK)
{
return new GoodFeaturesToTrackDetector(srcType, maxCorners, qualityLevel, minDistance, blockSize, useHarrisDetector, harrisK);
}
#endif /* !defined (HAVE_CUDA) */
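A short usage sketch for the detector above (illustrative only, not part of this commit):
// Illustrative sketch, not part of the patch.
cv::Ptr<cv::cuda::CornersDetector> detector =
    cv::cuda::createGoodFeaturesToTrackDetector(CV_8UC1, 1000, 0.01, 10.0, 3, false, 0.04);
cv::cuda::GpuMat d_corners;
detector->detect(d_gray, d_corners);        // d_gray: CV_8UC1 GpuMat already on the device
cv::Mat h_corners;
if (!d_corners.empty())
    d_corners.download(h_corners);          // 1xN CV_32FC2 row of corner positions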

View File

@@ -0,0 +1,579 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::cuda::calcHist(InputArray, OutputArray, Stream&) { throw_no_cuda(); }
void cv::cuda::equalizeHist(InputArray, OutputArray, InputOutputArray, Stream&) { throw_no_cuda(); }
cv::Ptr<cv::cuda::CLAHE> cv::cuda::createCLAHE(double, cv::Size) { throw_no_cuda(); return cv::Ptr<cv::cuda::CLAHE>(); }
void cv::cuda::evenLevels(OutputArray, int, int, int) { throw_no_cuda(); }
void cv::cuda::histEven(InputArray, OutputArray, InputOutputArray, int, int, int, Stream&) { throw_no_cuda(); }
void cv::cuda::histEven(InputArray, GpuMat*, InputOutputArray, int*, int*, int*, Stream&) { throw_no_cuda(); }
void cv::cuda::histRange(InputArray, OutputArray, InputArray, InputOutputArray, Stream&) { throw_no_cuda(); }
void cv::cuda::histRange(InputArray, GpuMat*, const GpuMat*, InputOutputArray, Stream&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
////////////////////////////////////////////////////////////////////////
// calcHist
namespace hist
{
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
}
void cv::cuda::calcHist(InputArray _src, OutputArray _hist, Stream& stream)
{
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC1 );
_hist.create(1, 256, CV_32SC1);
GpuMat hist = _hist.getGpuMat();
hist.setTo(Scalar::all(0), stream);
hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
// equalizeHist
namespace hist
{
void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream);
}
void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, InputOutputArray _buf, Stream& _stream)
{
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC1 );
_dst.create(src.size(), src.type());
GpuMat dst = _dst.getGpuMat();
int intBufSize;
nppSafeCall( nppsIntegralGetBufferSize_32s(256, &intBufSize) );
size_t bufSize = intBufSize + 2 * 256 * sizeof(int);
ensureSizeIsEnough(1, static_cast<int>(bufSize), CV_8UC1, _buf);
GpuMat buf = _buf.getGpuMat();
GpuMat hist(1, 256, CV_32SC1, buf.data);
GpuMat lut(1, 256, CV_32SC1, buf.data + 256 * sizeof(int));
GpuMat intBuf(1, intBufSize, CV_8UC1, buf.data + 2 * 256 * sizeof(int));
cuda::calcHist(src, hist, _stream);
cudaStream_t stream = StreamAccessor::getStream(_stream);
NppStreamHandler h(stream);
nppSafeCall( nppsIntegral_32s(hist.ptr<Npp32s>(), lut.ptr<Npp32s>(), 256, intBuf.ptr<Npp8u>()) );
hist::equalizeHist(src, dst, lut.ptr<int>(), stream);
}
////////////////////////////////////////////////////////////////////////
// CLAHE
namespace clahe
{
void calcLut(PtrStepSzb src, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, int clipLimit, float lutScale, cudaStream_t stream);
void transform(PtrStepSzb src, PtrStepSzb dst, PtrStepb lut, int tilesX, int tilesY, int2 tileSize, cudaStream_t stream);
}
namespace
{
class CLAHE_Impl : public cv::cuda::CLAHE
{
public:
CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
cv::AlgorithmInfo* info() const;
void apply(cv::InputArray src, cv::OutputArray dst);
void apply(InputArray src, OutputArray dst, Stream& stream);
void setClipLimit(double clipLimit);
double getClipLimit() const;
void setTilesGridSize(cv::Size tileGridSize);
cv::Size getTilesGridSize() const;
void collectGarbage();
private:
double clipLimit_;
int tilesX_;
int tilesY_;
GpuMat srcExt_;
GpuMat lut_;
};
CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
{
}
CV_INIT_ALGORITHM(CLAHE_Impl, "CLAHE_GPU",
obj.info()->addParam(obj, "clipLimit", obj.clipLimit_);
obj.info()->addParam(obj, "tilesX", obj.tilesX_);
obj.info()->addParam(obj, "tilesY", obj.tilesY_))
void CLAHE_Impl::apply(cv::InputArray _src, cv::OutputArray _dst)
{
apply(_src, _dst, Stream::Null());
}
void CLAHE_Impl::apply(InputArray _src, OutputArray _dst, Stream& s)
{
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC1 );
_dst.create( src.size(), src.type() );
GpuMat dst = _dst.getGpuMat();
const int histSize = 256;
ensureSizeIsEnough(tilesX_ * tilesY_, histSize, CV_8UC1, lut_);
cudaStream_t stream = StreamAccessor::getStream(s);
cv::Size tileSize;
GpuMat srcForLut;
if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
{
tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
srcForLut = src;
}
else
{
#ifndef HAVE_OPENCV_CUDAARITHM
throw_no_cuda();
#else
cv::cuda::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101, cv::Scalar(), s);
#endif
tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
srcForLut = srcExt_;
}
const int tileSizeTotal = tileSize.area();
const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
int clipLimit = 0;
if (clipLimit_ > 0.0)
{
clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
clipLimit = std::max(clipLimit, 1);
}
clahe::calcLut(srcForLut, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), clipLimit, lutScale, stream);
clahe::transform(src, dst, lut_, tilesX_, tilesY_, make_int2(tileSize.width, tileSize.height), stream);
}
void CLAHE_Impl::setClipLimit(double clipLimit)
{
clipLimit_ = clipLimit;
}
double CLAHE_Impl::getClipLimit() const
{
return clipLimit_;
}
void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
{
tilesX_ = tileGridSize.width;
tilesY_ = tileGridSize.height;
}
cv::Size CLAHE_Impl::getTilesGridSize() const
{
return cv::Size(tilesX_, tilesY_);
}
void CLAHE_Impl::collectGarbage()
{
srcExt_.release();
lut_.release();
}
}
cv::Ptr<cv::cuda::CLAHE> cv::cuda::createCLAHE(double clipLimit, cv::Size tileGridSize)
{
return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
}
////////////////////////////////////////////////////////////////////////
// NPP Histogram
namespace
{
typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, int* hpBufferSize);
typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], int* hpBufferSize);
template<int SDEPTH> struct NppHistogramEvenFuncC1
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s * pHist,
int nLevels, Npp32s nLowerLevel, Npp32s nUpperLevel, Npp8u * pBuffer);
};
template<int SDEPTH> struct NppHistogramEvenFuncC4
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI,
Npp32s * pHist[4], int nLevels[4], Npp32s nLowerLevel[4], Npp32s nUpperLevel[4], Npp8u * pBuffer);
};
template<int SDEPTH, typename NppHistogramEvenFuncC1<SDEPTH>::func_ptr func, get_buf_size_c1_t get_buf_size>
struct NppHistogramEvenC1
{
typedef typename NppHistogramEvenFuncC1<SDEPTH>::src_t src_t;
static void hist(const GpuMat& src, OutputArray _hist, InputOutputArray _buf, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
{
const int levels = histSize + 1;
_hist.create(1, histSize, CV_32S);
GpuMat hist = _hist.getGpuMat();
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
int buf_size;
get_buf_size(sz, levels, &buf_size);
ensureSizeIsEnough(1, buf_size, CV_8UC1, _buf);
GpuMat buf = _buf.getGpuMat();
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, hist.ptr<Npp32s>(), levels,
lowerLevel, upperLevel, buf.ptr<Npp8u>()) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppHistogramEvenFuncC4<SDEPTH>::func_ptr func, get_buf_size_c4_t get_buf_size>
struct NppHistogramEvenC4
{
typedef typename NppHistogramEvenFuncC4<SDEPTH>::src_t src_t;
static void hist(const GpuMat& src, GpuMat hist[4],InputOutputArray _buf, int histSize[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream)
{
int levels[] = {histSize[0] + 1, histSize[1] + 1, histSize[2] + 1, histSize[3] + 1};
hist[0].create(1, histSize[0], CV_32S);
hist[1].create(1, histSize[1], CV_32S);
hist[2].create(1, histSize[2], CV_32S);
hist[3].create(1, histSize[3], CV_32S);
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Npp32s* pHist[] = {hist[0].ptr<Npp32s>(), hist[1].ptr<Npp32s>(), hist[2].ptr<Npp32s>(), hist[3].ptr<Npp32s>()};
int buf_size;
get_buf_size(sz, levels, &buf_size);
ensureSizeIsEnough(1, buf_size, CV_8U, _buf);
GpuMat buf = _buf.getGpuMat();
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, pHist, levels, lowerLevel, upperLevel, buf.ptr<Npp8u>()) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH> struct NppHistogramRangeFuncC1
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef Npp32s level_t;
enum {LEVEL_TYPE_CODE=CV_32SC1};
typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist,
const Npp32s* pLevels, int nLevels, Npp8u* pBuffer);
};
template<> struct NppHistogramRangeFuncC1<CV_32F>
{
typedef Npp32f src_t;
typedef Npp32f level_t;
enum {LEVEL_TYPE_CODE=CV_32FC1};
typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist,
const Npp32f* pLevels, int nLevels, Npp8u* pBuffer);
};
template<int SDEPTH> struct NppHistogramRangeFuncC4
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef Npp32s level_t;
enum {LEVEL_TYPE_CODE=CV_32SC1};
typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist[4],
const Npp32s* pLevels[4], int nLevels[4], Npp8u* pBuffer);
};
template<> struct NppHistogramRangeFuncC4<CV_32F>
{
typedef Npp32f src_t;
typedef Npp32f level_t;
enum {LEVEL_TYPE_CODE=CV_32FC1};
typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s* pHist[4],
const Npp32f* pLevels[4], int nLevels[4], Npp8u* pBuffer);
};
template<int SDEPTH, typename NppHistogramRangeFuncC1<SDEPTH>::func_ptr func, get_buf_size_c1_t get_buf_size>
struct NppHistogramRangeC1
{
typedef typename NppHistogramRangeFuncC1<SDEPTH>::src_t src_t;
typedef typename NppHistogramRangeFuncC1<SDEPTH>::level_t level_t;
enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1<SDEPTH>::LEVEL_TYPE_CODE};
static void hist(const GpuMat& src, OutputArray _hist, const GpuMat& levels, InputOutputArray _buf, cudaStream_t stream)
{
CV_Assert( levels.type() == LEVEL_TYPE_CODE && levels.rows == 1 );
_hist.create(1, levels.cols - 1, CV_32S);
GpuMat hist = _hist.getGpuMat();
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
int buf_size;
get_buf_size(sz, levels.cols, &buf_size);
ensureSizeIsEnough(1, buf_size, CV_8U, _buf);
GpuMat buf = _buf.getGpuMat();
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, hist.ptr<Npp32s>(), levels.ptr<level_t>(), levels.cols, buf.ptr<Npp8u>()) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppHistogramRangeFuncC4<SDEPTH>::func_ptr func, get_buf_size_c4_t get_buf_size>
struct NppHistogramRangeC4
{
typedef typename NppHistogramRangeFuncC4<SDEPTH>::src_t src_t;
typedef typename NppHistogramRangeFuncC1<SDEPTH>::level_t level_t;
enum {LEVEL_TYPE_CODE=NppHistogramRangeFuncC1<SDEPTH>::LEVEL_TYPE_CODE};
static void hist(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4],InputOutputArray _buf, cudaStream_t stream)
{
CV_Assert( levels[0].type() == LEVEL_TYPE_CODE && levels[0].rows == 1 );
CV_Assert( levels[1].type() == LEVEL_TYPE_CODE && levels[1].rows == 1 );
CV_Assert( levels[2].type() == LEVEL_TYPE_CODE && levels[2].rows == 1 );
CV_Assert( levels[3].type() == LEVEL_TYPE_CODE && levels[3].rows == 1 );
hist[0].create(1, levels[0].cols - 1, CV_32S);
hist[1].create(1, levels[1].cols - 1, CV_32S);
hist[2].create(1, levels[2].cols - 1, CV_32S);
hist[3].create(1, levels[3].cols - 1, CV_32S);
Npp32s* pHist[] = {hist[0].ptr<Npp32s>(), hist[1].ptr<Npp32s>(), hist[2].ptr<Npp32s>(), hist[3].ptr<Npp32s>()};
int nLevels[] = {levels[0].cols, levels[1].cols, levels[2].cols, levels[3].cols};
const level_t* pLevels[] = {levels[0].ptr<level_t>(), levels[1].ptr<level_t>(), levels[2].ptr<level_t>(), levels[3].ptr<level_t>()};
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
int buf_size;
get_buf_size(sz, nLevels, &buf_size);
ensureSizeIsEnough(1, buf_size, CV_8U, _buf);
GpuMat buf = _buf.getGpuMat();
NppStreamHandler h(stream);
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), sz, pHist, pLevels, nLevels, buf.ptr<Npp8u>()) );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
}
void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int upperLevel)
{
const int kind = _levels.kind();
_levels.create(1, nLevels, CV_32SC1);
Mat host_levels;
if (kind == _InputArray::GPU_MAT)
host_levels.create(1, nLevels, CV_32SC1);
else
host_levels = _levels.getMat();
nppSafeCall( nppiEvenLevelsHost_32s(host_levels.ptr<Npp32s>(), nLevels, lowerLevel, upperLevel) );
if (kind == _InputArray::GPU_MAT)
_levels.getGpuMatRef().upload(host_levels);
}
namespace hist
{
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream);
}
namespace
{
void histEven8u(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
{
hist.create(1, histSize, CV_32S);
cudaSafeCall( cudaMemsetAsync(hist.data, 0, histSize * sizeof(int), stream) );
hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, stream);
}
}
void cv::cuda::histEven(InputArray _src, OutputArray hist, InputOutputArray buf, int histSize, int lowerLevel, int upperLevel, Stream& stream)
{
typedef void (*hist_t)(const GpuMat& src, OutputArray hist, InputOutputArray buf, int levels, int lowerLevel, int upperLevel, cudaStream_t stream);
static const hist_t hist_callers[] =
{
NppHistogramEvenC1<CV_8U , nppiHistogramEven_8u_C1R , nppiHistogramEvenGetBufferSize_8u_C1R >::hist,
0,
NppHistogramEvenC1<CV_16U, nppiHistogramEven_16u_C1R, nppiHistogramEvenGetBufferSize_16u_C1R>::hist,
NppHistogramEvenC1<CV_16S, nppiHistogramEven_16s_C1R, nppiHistogramEvenGetBufferSize_16s_C1R>::hist
};
GpuMat src = _src.getGpuMat();
if (src.depth() == CV_8U && deviceSupports(FEATURE_SET_COMPUTE_30))
{
histEven8u(src, hist.getGpuMatRef(), histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
return;
}
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 );
hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
}
void cv::cuda::histEven(InputArray _src, GpuMat hist[4], InputOutputArray buf, int histSize[4], int lowerLevel[4], int upperLevel[4], Stream& stream)
{
typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], InputOutputArray buf, int levels[4], int lowerLevel[4], int upperLevel[4], cudaStream_t stream);
static const hist_t hist_callers[] =
{
NppHistogramEvenC4<CV_8U , nppiHistogramEven_8u_C4R , nppiHistogramEvenGetBufferSize_8u_C4R >::hist,
0,
NppHistogramEvenC4<CV_16U, nppiHistogramEven_16u_C4R, nppiHistogramEvenGetBufferSize_16u_C4R>::hist,
NppHistogramEvenC4<CV_16S, nppiHistogramEven_16s_C4R, nppiHistogramEvenGetBufferSize_16s_C4R>::hist
};
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 );
hist_callers[src.depth()](src, hist, buf, histSize, lowerLevel, upperLevel, StreamAccessor::getStream(stream));
}
void cv::cuda::histRange(InputArray _src, OutputArray hist, InputArray _levels, InputOutputArray buf, Stream& stream)
{
typedef void (*hist_t)(const GpuMat& src, OutputArray hist, const GpuMat& levels, InputOutputArray buf, cudaStream_t stream);
static const hist_t hist_callers[] =
{
NppHistogramRangeC1<CV_8U , nppiHistogramRange_8u_C1R , nppiHistogramRangeGetBufferSize_8u_C1R >::hist,
0,
NppHistogramRangeC1<CV_16U, nppiHistogramRange_16u_C1R, nppiHistogramRangeGetBufferSize_16u_C1R>::hist,
NppHistogramRangeC1<CV_16S, nppiHistogramRange_16s_C1R, nppiHistogramRangeGetBufferSize_16s_C1R>::hist,
0,
NppHistogramRangeC1<CV_32F, nppiHistogramRange_32f_C1R, nppiHistogramRangeGetBufferSize_32f_C1R>::hist
};
GpuMat src = _src.getGpuMat();
GpuMat levels = _levels.getGpuMat();
CV_Assert( src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 || src.type() == CV_32FC1 );
hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
}
void cv::cuda::histRange(InputArray _src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, Stream& stream)
{
typedef void (*hist_t)(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], InputOutputArray buf, cudaStream_t stream);
static const hist_t hist_callers[] =
{
NppHistogramRangeC4<CV_8U , nppiHistogramRange_8u_C4R , nppiHistogramRangeGetBufferSize_8u_C4R >::hist,
0,
NppHistogramRangeC4<CV_16U, nppiHistogramRange_16u_C4R, nppiHistogramRangeGetBufferSize_16u_C4R>::hist,
NppHistogramRangeC4<CV_16S, nppiHistogramRange_16s_C4R, nppiHistogramRangeGetBufferSize_16s_C4R>::hist,
0,
NppHistogramRangeC4<CV_32F, nppiHistogramRange_32f_C4R, nppiHistogramRangeGetBufferSize_32f_C4R>::hist
};
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC4 || src.type() == CV_16UC4 || src.type() == CV_16SC4 || src.type() == CV_32FC4 );
hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
}
#endif /* !defined (HAVE_CUDA) */
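A short usage sketch for the histogram functions above (illustrative only, not part of this commit; it assumes the Stream arguments default to Stream::Null() in the public header):
// Illustrative sketch, not part of the patch.
cv::cuda::GpuMat d_src;                      // CV_8UC1 image on the device
cv::cuda::GpuMat d_hist, d_dst, d_buf;
cv::cuda::calcHist(d_src, d_hist);           // d_hist becomes 1x256 CV_32SC1
cv::cuda::equalizeHist(d_src, d_dst, d_buf); // d_buf: reusable scratch buffer
cv::Ptr<cv::cuda::CLAHE> clahe = cv::cuda::createCLAHE(40.0, cv::Size(8, 8));
clahe->apply(d_src, d_dst);                  // contrast-limited adaptive equalization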

View File

@@ -0,0 +1,297 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) || !defined(HAVE_OPENCV_CUDAFILTERS)
Ptr<cuda::HoughCirclesDetector> cv::cuda::createHoughCirclesDetector(float, float, int, int, int, int, int) { throw_no_cuda(); return Ptr<HoughCirclesDetector>(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace cuda { namespace device
{
namespace hough
{
int buildPointList_gpu(PtrStepSzb src, unsigned int* list);
}
namespace hough_circles
{
void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp);
int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold);
int circlesAccumRadius_gpu(const unsigned int* centers, int centersCount, const unsigned int* list, int count,
float3* circles, int maxCircles, float dp, int minRadius, int maxRadius, int threshold, bool has20);
}
}}}
namespace
{
class HoughCirclesDetectorImpl : public HoughCirclesDetector
{
public:
HoughCirclesDetectorImpl(float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles);
void detect(InputArray src, OutputArray circles);
void setDp(float dp) { dp_ = dp; }
float getDp() const { return dp_; }
void setMinDist(float minDist) { minDist_ = minDist; }
float getMinDist() const { return minDist_; }
void setCannyThreshold(int cannyThreshold) { cannyThreshold_ = cannyThreshold; }
int getCannyThreshold() const { return cannyThreshold_; }
void setVotesThreshold(int votesThreshold) { votesThreshold_ = votesThreshold; }
int getVotesThreshold() const { return votesThreshold_; }
void setMinRadius(int minRadius) { minRadius_ = minRadius; }
int getMinRadius() const { return minRadius_; }
void setMaxRadius(int maxRadius) { maxRadius_ = maxRadius; }
int getMaxRadius() const { return maxRadius_; }
void setMaxCircles(int maxCircles) { maxCircles_ = maxCircles; }
int getMaxCircles() const { return maxCircles_; }
void write(FileStorage& fs) const
{
fs << "name" << "HoughCirclesDetector_GPU"
<< "dp" << dp_
<< "minDist" << minDist_
<< "cannyThreshold" << cannyThreshold_
<< "votesThreshold" << votesThreshold_
<< "minRadius" << minRadius_
<< "maxRadius" << maxRadius_
<< "maxCircles" << maxCircles_;
}
void read(const FileNode& fn)
{
CV_Assert( String(fn["name"]) == "HoughCirclesDetector_GPU" );
dp_ = (float)fn["dp"];
minDist_ = (float)fn["minDist"];
cannyThreshold_ = (int)fn["cannyThreshold"];
votesThreshold_ = (int)fn["votesThreshold"];
minRadius_ = (int)fn["minRadius"];
maxRadius_ = (int)fn["maxRadius"];
maxCircles_ = (int)fn["maxCircles"];
}
private:
float dp_;
float minDist_;
int cannyThreshold_;
int votesThreshold_;
int minRadius_;
int maxRadius_;
int maxCircles_;
GpuMat dx_, dy_;
GpuMat edges_;
GpuMat accum_;
GpuMat list_;
GpuMat result_;
Ptr<cuda::Filter> filterDx_;
Ptr<cuda::Filter> filterDy_;
Ptr<cuda::CannyEdgeDetector> canny_;
};
HoughCirclesDetectorImpl::HoughCirclesDetectorImpl(float dp, float minDist, int cannyThreshold, int votesThreshold,
int minRadius, int maxRadius, int maxCircles) :
dp_(dp), minDist_(minDist), cannyThreshold_(cannyThreshold), votesThreshold_(votesThreshold),
minRadius_(minRadius), maxRadius_(maxRadius), maxCircles_(maxCircles)
{
canny_ = cuda::createCannyEdgeDetector(std::max(cannyThreshold_ / 2, 1), cannyThreshold_);
filterDx_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 1, 0);
filterDy_ = cuda::createSobelFilter(CV_8UC1, CV_32S, 0, 1);
}
void HoughCirclesDetectorImpl::detect(InputArray _src, OutputArray circles)
{
using namespace cv::cuda::device::hough;
using namespace cv::cuda::device::hough_circles;
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC1 );
CV_Assert( src.cols < std::numeric_limits<unsigned short>::max() );
CV_Assert( src.rows < std::numeric_limits<unsigned short>::max() );
CV_Assert( dp_ > 0 );
CV_Assert( minRadius_ > 0 && maxRadius_ > minRadius_ );
CV_Assert( cannyThreshold_ > 0 );
CV_Assert( votesThreshold_ > 0 );
CV_Assert( maxCircles_ > 0 );
const float idp = 1.0f / dp_;
filterDx_->apply(src, dx_);
filterDy_->apply(src, dy_);
canny_->setLowThreshold(std::max(cannyThreshold_ / 2, 1));
canny_->setHighThreshold(cannyThreshold_);
canny_->detect(dx_, dy_, edges_);
ensureSizeIsEnough(2, src.size().area(), CV_32SC1, list_);
unsigned int* srcPoints = list_.ptr<unsigned int>(0);
unsigned int* centers = list_.ptr<unsigned int>(1);
const int pointsCount = buildPointList_gpu(edges_, srcPoints);
if (pointsCount == 0)
{
circles.release();
return;
}
ensureSizeIsEnough(cvCeil(src.rows * idp) + 2, cvCeil(src.cols * idp) + 2, CV_32SC1, accum_);
accum_.setTo(Scalar::all(0));
circlesAccumCenters_gpu(srcPoints, pointsCount, dx_, dy_, accum_, minRadius_, maxRadius_, idp);
int centersCount = buildCentersList_gpu(accum_, centers, votesThreshold_);
if (centersCount == 0)
{
circles.release();
return;
}
if (minDist_ > 1)
{
AutoBuffer<ushort2> oldBuf_(centersCount);
AutoBuffer<ushort2> newBuf_(centersCount);
int newCount = 0;
ushort2* oldBuf = oldBuf_;
ushort2* newBuf = newBuf_;
cudaSafeCall( cudaMemcpy(oldBuf, centers, centersCount * sizeof(ushort2), cudaMemcpyDeviceToHost) );
// Filter candidate centers on the host using a uniform grid so that no two
// kept centers lie closer than minDist_ to each other.
const int cellSize = cvRound(minDist_);
const int gridWidth = (src.cols + cellSize - 1) / cellSize;
const int gridHeight = (src.rows + cellSize - 1) / cellSize;
std::vector< std::vector<ushort2> > grid(gridWidth * gridHeight);
const float minDist2 = minDist_ * minDist_;
for (int i = 0; i < centersCount; ++i)
{
ushort2 p = oldBuf[i];
bool good = true;
int xCell = static_cast<int>(p.x / cellSize);
int yCell = static_cast<int>(p.y / cellSize);
int x1 = xCell - 1;
int y1 = yCell - 1;
int x2 = xCell + 1;
int y2 = yCell + 1;
// boundary check
x1 = std::max(0, x1);
y1 = std::max(0, y1);
x2 = std::min(gridWidth - 1, x2);
y2 = std::min(gridHeight - 1, y2);
for (int yy = y1; yy <= y2; ++yy)
{
for (int xx = x1; xx <= x2; ++xx)
{
std::vector<ushort2>& m = grid[yy * gridWidth + xx];
for(size_t j = 0; j < m.size(); ++j)
{
float dx = (float)(p.x - m[j].x);
float dy = (float)(p.y - m[j].y);
if (dx * dx + dy * dy < minDist2)
{
good = false;
goto break_out;
}
}
}
}
break_out:
if(good)
{
grid[yCell * gridWidth + xCell].push_back(p);
newBuf[newCount++] = p;
}
}
cudaSafeCall( cudaMemcpy(centers, newBuf, newCount * sizeof(unsigned int), cudaMemcpyHostToDevice) );
centersCount = newCount;
}
ensureSizeIsEnough(1, maxCircles_, CV_32FC3, result_);
int circlesCount = circlesAccumRadius_gpu(centers, centersCount, srcPoints, pointsCount, result_.ptr<float3>(), maxCircles_,
dp_, minRadius_, maxRadius_, votesThreshold_, deviceSupports(FEATURE_SET_COMPUTE_20));
if (circlesCount == 0)
{
circles.release();
return;
}
result_.cols = circlesCount;
result_.copyTo(circles);
}
}
Ptr<HoughCirclesDetector> cv::cuda::createHoughCirclesDetector(float dp, float minDist, int cannyThreshold, int votesThreshold, int minRadius, int maxRadius, int maxCircles)
{
return new HoughCirclesDetectorImpl(dp, minDist, cannyThreshold, votesThreshold, minRadius, maxRadius, maxCircles);
}
#endif /* !defined (HAVE_CUDA) */
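A short usage sketch for the circle detector above (illustrative only, not part of this commit):
// Illustrative sketch, not part of the patch.
cv::Ptr<cv::cuda::HoughCirclesDetector> hough =
    cv::cuda::createHoughCirclesDetector(1.0f, 20.0f, 100, 18, 10, 50, 300);
cv::cuda::GpuMat d_circles;
hough->detect(d_gray, d_circles);           // d_gray: CV_8UC1 GpuMat; d_circles: 1xN CV_32FC3 (x, y, radius)
cv::Mat h_circles;
if (!d_circles.empty())
    d_circles.download(h_circles);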

View File

@@ -0,0 +1,202 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
Ptr<cuda::HoughLinesDetector> cv::cuda::createHoughLinesDetector(float, float, int, bool, int) { throw_no_cuda(); return Ptr<HoughLinesDetector>(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace cuda { namespace device
{
namespace hough
{
int buildPointList_gpu(PtrStepSzb src, unsigned int* list);
}
namespace hough_lines
{
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort);
}
}}}
namespace
{
class HoughLinesDetectorImpl : public HoughLinesDetector
{
public:
HoughLinesDetectorImpl(float rho, float theta, int threshold, bool doSort, int maxLines) :
rho_(rho), theta_(theta), threshold_(threshold), doSort_(doSort), maxLines_(maxLines)
{
}
void detect(InputArray src, OutputArray lines);
void downloadResults(InputArray d_lines, OutputArray h_lines, OutputArray h_votes = noArray());
void setRho(float rho) { rho_ = rho; }
float getRho() const { return rho_; }
void setTheta(float theta) { theta_ = theta; }
float getTheta() const { return theta_; }
void setThreshold(int threshold) { threshold_ = threshold; }
int getThreshold() const { return threshold_; }
void setDoSort(bool doSort) { doSort_ = doSort; }
bool getDoSort() const { return doSort_; }
void setMaxLines(int maxLines) { maxLines_ = maxLines; }
int getMaxLines() const { return maxLines_; }
void write(FileStorage& fs) const
{
fs << "name" << "HoughLinesDetector_GPU"
<< "rho" << rho_
<< "theta" << theta_
<< "threshold" << threshold_
<< "doSort" << doSort_
<< "maxLines" << maxLines_;
}
void read(const FileNode& fn)
{
CV_Assert( String(fn["name"]) == "HoughLinesDetector_GPU" );
rho_ = (float)fn["rho"];
theta_ = (float)fn["theta"];
threshold_ = (int)fn["threshold"];
doSort_ = (int)fn["doSort"] != 0;
maxLines_ = (int)fn["maxLines"];
}
private:
float rho_;
float theta_;
int threshold_;
bool doSort_;
int maxLines_;
GpuMat accum_;
GpuMat list_;
GpuMat result_;
};
void HoughLinesDetectorImpl::detect(InputArray _src, OutputArray lines)
{
using namespace cv::cuda::device::hough;
using namespace cv::cuda::device::hough_lines;
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC1 );
CV_Assert( src.cols < std::numeric_limits<unsigned short>::max() );
CV_Assert( src.rows < std::numeric_limits<unsigned short>::max() );
ensureSizeIsEnough(1, src.size().area(), CV_32SC1, list_);
unsigned int* srcPoints = list_.ptr<unsigned int>();
const int pointsCount = buildPointList_gpu(src, srcPoints);
if (pointsCount == 0)
{
lines.release();
return;
}
const int numangle = cvRound(CV_PI / theta_);
const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho_);
CV_Assert( numangle > 0 && numrho > 0 );
ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum_);
accum_.setTo(Scalar::all(0));
DeviceInfo devInfo;
linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
ensureSizeIsEnough(2, maxLines_, CV_32FC2, result_);
int linesCount = linesGetResult_gpu(accum_, result_.ptr<float2>(0), result_.ptr<int>(1), maxLines_, rho_, theta_, threshold_, doSort_);
if (linesCount == 0)
{
lines.release();
return;
}
result_.cols = linesCount;
result_.copyTo(lines);
}
void HoughLinesDetectorImpl::downloadResults(InputArray _d_lines, OutputArray h_lines, OutputArray h_votes)
{
GpuMat d_lines = _d_lines.getGpuMat();
if (d_lines.empty())
{
h_lines.release();
if (h_votes.needed())
h_votes.release();
return;
}
CV_Assert( d_lines.rows == 2 && d_lines.type() == CV_32FC2 );
d_lines.row(0).download(h_lines);
if (h_votes.needed())
{
GpuMat d_votes(1, d_lines.cols, CV_32SC1, d_lines.ptr<int>(1));
d_votes.download(h_votes);
}
}
}
Ptr<HoughLinesDetector> cv::cuda::createHoughLinesDetector(float rho, float theta, int threshold, bool doSort, int maxLines)
{
return new HoughLinesDetectorImpl(rho, theta, threshold, doSort, maxLines);
}
#endif /* !defined (HAVE_CUDA) */
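A short usage sketch for the line detector above (illustrative only, not part of this commit):
// Illustrative sketch, not part of the patch.
cv::Ptr<cv::cuda::HoughLinesDetector> hough =
    cv::cuda::createHoughLinesDetector(1.0f, (float)(CV_PI / 180.0), 100, true, 4096);
cv::cuda::GpuMat d_lines;
hough->detect(d_edges, d_lines);            // d_edges: CV_8UC1 binary edge map on the device
cv::Mat h_lines, h_votes;
hough->downloadResults(d_lines, h_lines, h_votes);   // h_lines: 1xN CV_32FC2 (rho, theta) pairs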

View File

@@ -0,0 +1,183 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
Ptr<cuda::HoughSegmentDetector> cv::cuda::createHoughSegmentDetector(float, float, int, int, int) { throw_no_cuda(); return Ptr<HoughSegmentDetector>(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace cuda { namespace device
{
namespace hough
{
int buildPointList_gpu(PtrStepSzb src, unsigned int* list);
}
namespace hough_lines
{
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20);
}
namespace hough_segments
{
int houghLinesProbabilistic_gpu(PtrStepSzb mask, PtrStepSzi accum, int4* out, int maxSize, float rho, float theta, int lineGap, int lineLength);
}
}}}
namespace
{
class HoughSegmentDetectorImpl : public HoughSegmentDetector
{
public:
HoughSegmentDetectorImpl(float rho, float theta, int minLineLength, int maxLineGap, int maxLines) :
rho_(rho), theta_(theta), minLineLength_(minLineLength), maxLineGap_(maxLineGap), maxLines_(maxLines)
{
}
void detect(InputArray src, OutputArray lines);
void setRho(float rho) { rho_ = rho; }
float getRho() const { return rho_; }
void setTheta(float theta) { theta_ = theta; }
float getTheta() const { return theta_; }
void setMinLineLength(int minLineLength) { minLineLength_ = minLineLength; }
int getMinLineLength() const { return minLineLength_; }
void setMaxLineGap(int maxLineGap) { maxLineGap_ = maxLineGap; }
int getMaxLineGap() const { return maxLineGap_; }
void setMaxLines(int maxLines) { maxLines_ = maxLines; }
int getMaxLines() const { return maxLines_; }
void write(FileStorage& fs) const
{
fs << "name" << "PHoughLinesDetector_GPU"
<< "rho" << rho_
<< "theta" << theta_
<< "minLineLength" << minLineLength_
<< "maxLineGap" << maxLineGap_
<< "maxLines" << maxLines_;
}
void read(const FileNode& fn)
{
CV_Assert( String(fn["name"]) == "PHoughLinesDetector_GPU" );
rho_ = (float)fn["rho"];
theta_ = (float)fn["theta"];
minLineLength_ = (int)fn["minLineLength"];
maxLineGap_ = (int)fn["maxLineGap"];
maxLines_ = (int)fn["maxLines"];
}
private:
float rho_;
float theta_;
int minLineLength_;
int maxLineGap_;
int maxLines_;
GpuMat accum_;
GpuMat list_;
GpuMat result_;
};
void HoughSegmentDetectorImpl::detect(InputArray _src, OutputArray lines)
{
using namespace cv::cuda::device::hough;
using namespace cv::cuda::device::hough_lines;
using namespace cv::cuda::device::hough_segments;
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC1 );
CV_Assert( src.cols < std::numeric_limits<unsigned short>::max() );
CV_Assert( src.rows < std::numeric_limits<unsigned short>::max() );
ensureSizeIsEnough(1, src.size().area(), CV_32SC1, list_);
unsigned int* srcPoints = list_.ptr<unsigned int>();
const int pointsCount = buildPointList_gpu(src, srcPoints);
if (pointsCount == 0)
{
lines.release();
return;
}
const int numangle = cvRound(CV_PI / theta_);
const int numrho = cvRound(((src.cols + src.rows) * 2 + 1) / rho_);
CV_Assert( numangle > 0 && numrho > 0 );
ensureSizeIsEnough(numangle + 2, numrho + 2, CV_32SC1, accum_);
accum_.setTo(Scalar::all(0));
DeviceInfo devInfo;
linesAccum_gpu(srcPoints, pointsCount, accum_, rho_, theta_, devInfo.sharedMemPerBlock(), devInfo.supports(FEATURE_SET_COMPUTE_20));
ensureSizeIsEnough(1, maxLines_, CV_32SC4, result_);
int linesCount = houghLinesProbabilistic_gpu(src, accum_, result_.ptr<int4>(), maxLines_, rho_, theta_, maxLineGap_, minLineLength_);
if (linesCount == 0)
{
lines.release();
return;
}
result_.cols = linesCount;
result_.copyTo(lines);
}
}
Ptr<HoughSegmentDetector> cv::cuda::createHoughSegmentDetector(float rho, float theta, int minLineLength, int maxLineGap, int maxLines)
{
return new HoughSegmentDetectorImpl(rho, theta, minLineLength, maxLineGap, maxLines);
}
#endif /* !defined (HAVE_CUDA) */
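A short usage sketch for the segment detector above (illustrative only, not part of this commit):
// Illustrative sketch, not part of the patch.
cv::Ptr<cv::cuda::HoughSegmentDetector> hough =
    cv::cuda::createHoughSegmentDetector(1.0f, (float)(CV_PI / 180.0), 50, 5, 4096);
cv::cuda::GpuMat d_lines;
hough->detect(d_edges, d_lines);            // d_edges: CV_8UC1 edge map; d_lines: 1xN CV_32SC4 (x1, y1, x2, y2)
cv::Mat h_lines;
if (!d_lines.empty())
    d_lines.download(h_lines);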

View File

@@ -0,0 +1,649 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || !defined (HAVE_OPENCV_CUDAARITHM) || defined (CUDA_DISABLER)
Ptr<cuda::TemplateMatching> cv::cuda::createTemplateMatching(int, int, Size) { throw_no_cuda(); return Ptr<cuda::TemplateMatching>(); }
#else
namespace cv { namespace cuda { namespace device
{
namespace match_template
{
void matchTemplateNaive_CCORR_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
void matchTemplateNaive_CCORR_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
void matchTemplateNaive_SQDIFF_8U(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
void matchTemplateNaive_SQDIFF_32F(const PtrStepSzb image, const PtrStepSzb templ, PtrStepSzf result, int cn, cudaStream_t stream);
void matchTemplatePrepared_SQDIFF_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
int cn, cudaStream_t stream);
void matchTemplatePrepared_SQDIFF_NORMED_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum, unsigned long long templ_sqsum, PtrStepSzf result,
int cn, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8U(int w, int h, const PtrStepSz<unsigned int> image_sum, unsigned int templ_sum, PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r,
const PtrStepSz<unsigned int> image_sum_g,
const PtrStepSz<unsigned int> image_sum_b,
const PtrStepSz<unsigned int> image_sum_a,
unsigned int templ_sum_r,
unsigned int templ_sum_g,
unsigned int templ_sum_b,
unsigned int templ_sum_a,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8U(
int w, int h, const PtrStepSz<unsigned int> image_sum,
const PtrStepSz<unsigned long long> image_sqsum,
unsigned int templ_sum, unsigned long long templ_sqsum,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC2(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC3(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
PtrStepSzf result, cudaStream_t stream);
void matchTemplatePrepared_CCOFF_NORMED_8UC4(
int w, int h,
const PtrStepSz<unsigned int> image_sum_r, const PtrStepSz<unsigned long long> image_sqsum_r,
const PtrStepSz<unsigned int> image_sum_g, const PtrStepSz<unsigned long long> image_sqsum_g,
const PtrStepSz<unsigned int> image_sum_b, const PtrStepSz<unsigned long long> image_sqsum_b,
const PtrStepSz<unsigned int> image_sum_a, const PtrStepSz<unsigned long long> image_sqsum_a,
unsigned int templ_sum_r, unsigned long long templ_sqsum_r,
unsigned int templ_sum_g, unsigned long long templ_sqsum_g,
unsigned int templ_sum_b, unsigned long long templ_sqsum_b,
unsigned int templ_sum_a, unsigned long long templ_sqsum_a,
PtrStepSzf result, cudaStream_t stream);
void normalize_8U(int w, int h, const PtrStepSz<unsigned long long> image_sqsum,
unsigned long long templ_sqsum, PtrStepSzf result, int cn, cudaStream_t stream);
void extractFirstChannel_32F(const PtrStepSzb image, PtrStepSzf result, int cn, cudaStream_t stream);
}
}}}
namespace
{
// Returns the template-area threshold for the given method and depth: if the
// template's area is below this value, the naive matching kernel is used;
// otherwise the FFT-based path is taken (if available)
int getTemplateThreshold(int method, int depth)
{
switch (method)
{
case TM_CCORR:
if (depth == CV_32F) return 250;
if (depth == CV_8U) return 300;
break;
case TM_SQDIFF:
if (depth == CV_8U) return 300;
break;
}
CV_Error(Error::StsBadArg, "unsupported match template mode");
return 0;
}
///////////////////////////////////////////////////////////////
// CCORR_32F
class Match_CCORR_32F : public TemplateMatching
{
public:
explicit Match_CCORR_32F(Size user_block_size);
void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
private:
Ptr<cuda::Convolution> conv_;
GpuMat result_;
};
Match_CCORR_32F::Match_CCORR_32F(Size user_block_size)
{
conv_ = cuda::createConvolution(user_block_size);
}
void Match_CCORR_32F::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& _stream)
{
using namespace cv::cuda::device::match_template;
GpuMat image = _image.getGpuMat();
GpuMat templ = _templ.getGpuMat();
CV_Assert( image.depth() == CV_32F );
CV_Assert( image.type() == templ.type() );
CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
cudaStream_t stream = StreamAccessor::getStream(_stream);
_result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
GpuMat result = _result.getGpuMat();
if (templ.size().area() < getTemplateThreshold(TM_CCORR, CV_32F))
{
matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), stream);
return;
}
if (image.channels() == 1)
{
conv_->convolve(image.reshape(1), templ.reshape(1), result, true, _stream);
}
else
{
conv_->convolve(image.reshape(1), templ.reshape(1), result_, true, _stream);
extractFirstChannel_32F(result_, result, image.channels(), stream);
}
}
///////////////////////////////////////////////////////////////
// CCORR_8U
class Match_CCORR_8U : public TemplateMatching
{
public:
explicit Match_CCORR_8U(Size user_block_size) : match32F_(user_block_size)
{
}
void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
private:
GpuMat imagef_, templf_;
Match_CCORR_32F match32F_;
};
void Match_CCORR_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
{
using namespace cv::cuda::device::match_template;
GpuMat image = _image.getGpuMat();
GpuMat templ = _templ.getGpuMat();
CV_Assert( image.depth() == CV_8U );
CV_Assert( image.type() == templ.type() );
CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
if (templ.size().area() < getTemplateThreshold(TM_CCORR, CV_8U))
{
_result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
GpuMat result = _result.getGpuMat();
matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return;
}
image.convertTo(imagef_, CV_32F, stream);
templ.convertTo(templf_, CV_32F, stream);
match32F_.match(imagef_, templf_, _result, stream);
}
///////////////////////////////////////////////////////////////
// CCORR_NORMED_8U
class Match_CCORR_NORMED_8U : public TemplateMatching
{
public:
explicit Match_CCORR_NORMED_8U(Size user_block_size) : match_CCORR_(user_block_size)
{
}
void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
private:
Match_CCORR_8U match_CCORR_;
GpuMat image_sqsums_;
GpuMat intBuffer_;
};
void Match_CCORR_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
{
using namespace cv::cuda::device::match_template;
GpuMat image = _image.getGpuMat();
GpuMat templ = _templ.getGpuMat();
CV_Assert( image.depth() == CV_8U );
CV_Assert( image.type() == templ.type() );
CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
match_CCORR_.match(image, templ, _result, stream);
GpuMat result = _result.getGpuMat();
cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ.reshape(1))[0];
normalize_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
}
///////////////////////////////////////////////////////////////
// SQDIFF_32F
class Match_SQDIFF_32F : public TemplateMatching
{
public:
void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
};
void Match_SQDIFF_32F::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
{
using namespace cv::cuda::device::match_template;
GpuMat image = _image.getGpuMat();
GpuMat templ = _templ.getGpuMat();
CV_Assert( image.depth() == CV_32F );
CV_Assert( image.type() == templ.type() );
CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
_result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
GpuMat result = _result.getGpuMat();
matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
}
///////////////////////////////////////////////////////////////
// SQDIFF_8U
class Match_SQDIFF_8U : public TemplateMatching
{
public:
explicit Match_SQDIFF_8U(Size user_block_size) : match_CCORR_(user_block_size)
{
}
void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
private:
GpuMat image_sqsums_;
GpuMat intBuffer_;
Match_CCORR_8U match_CCORR_;
};
void Match_SQDIFF_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
{
using namespace cv::cuda::device::match_template;
GpuMat image = _image.getGpuMat();
GpuMat templ = _templ.getGpuMat();
CV_Assert( image.depth() == CV_8U );
CV_Assert( image.type() == templ.type() );
CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
if (templ.size().area() < getTemplateThreshold(TM_SQDIFF, CV_8U))
{
_result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32FC1);
GpuMat result = _result.getGpuMat();
matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return;
}
cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ.reshape(1))[0];
match_CCORR_.match(image, templ, _result, stream);
GpuMat result = _result.getGpuMat();
matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
}
///////////////////////////////////////////////////////////////
// SQDIFF_NORMED_8U
class Match_SQDIFF_NORMED_8U : public TemplateMatching
{
public:
explicit Match_SQDIFF_NORMED_8U(Size user_block_size) : match_CCORR_(user_block_size)
{
}
void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
private:
GpuMat image_sqsums_;
GpuMat intBuffer_;
Match_CCORR_8U match_CCORR_;
};
void Match_SQDIFF_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
{
using namespace cv::cuda::device::match_template;
GpuMat image = _image.getGpuMat();
GpuMat templ = _templ.getGpuMat();
CV_Assert( image.depth() == CV_8U );
CV_Assert( image.type() == templ.type() );
CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
cuda::sqrIntegral(image.reshape(1), image_sqsums_, intBuffer_, stream);
unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ.reshape(1))[0];
match_CCORR_.match(image, templ, _result, stream);
GpuMat result = _result.getGpuMat();
matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, image_sqsums_, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
}
///////////////////////////////////////////////////////////////
// CCOEFF_8U
class Match_CCOEFF_8U : public TemplateMatching
{
public:
explicit Match_CCOEFF_8U(Size user_block_size) : match_CCORR_(user_block_size)
{
}
void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
private:
GpuMat intBuffer_;
std::vector<GpuMat> images_;
std::vector<GpuMat> image_sums_;
Match_CCORR_8U match_CCORR_;
};
void Match_CCOEFF_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
{
using namespace cv::cuda::device::match_template;
GpuMat image = _image.getGpuMat();
GpuMat templ = _templ.getGpuMat();
CV_Assert( image.depth() == CV_8U );
CV_Assert( image.type() == templ.type() );
CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
match_CCORR_.match(image, templ, _result, stream);
GpuMat result = _result.getGpuMat();
if (image.channels() == 1)
{
image_sums_.resize(1);
cuda::integral(image, image_sums_[0], intBuffer_, stream);
unsigned int templ_sum = (unsigned int) cuda::sum(templ)[0];
matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, image_sums_[0], templ_sum, result, StreamAccessor::getStream(stream));
}
else
{
cuda::split(image, images_);
image_sums_.resize(images_.size());
for (int i = 0; i < image.channels(); ++i)
cuda::integral(images_[i], image_sums_[i], intBuffer_, stream);
Scalar templ_sum = cuda::sum(templ);
switch (image.channels())
{
case 2:
matchTemplatePrepared_CCOFF_8UC2(
templ.cols, templ.rows, image_sums_[0], image_sums_[1],
(unsigned int) templ_sum[0], (unsigned int) templ_sum[1],
result, StreamAccessor::getStream(stream));
break;
case 3:
matchTemplatePrepared_CCOFF_8UC3(
templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2],
(unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (unsigned int) templ_sum[2],
result, StreamAccessor::getStream(stream));
break;
case 4:
matchTemplatePrepared_CCOFF_8UC4(
templ.cols, templ.rows, image_sums_[0], image_sums_[1], image_sums_[2], image_sums_[3],
(unsigned int) templ_sum[0], (unsigned int) templ_sum[1], (unsigned int) templ_sum[2], (unsigned int) templ_sum[3],
result, StreamAccessor::getStream(stream));
break;
default:
CV_Error(Error::StsBadArg, "unsupported number of channels");
}
}
}
///////////////////////////////////////////////////////////////
// CCOEFF_NORMED_8U
class Match_CCOEFF_NORMED_8U : public TemplateMatching
{
public:
explicit Match_CCOEFF_NORMED_8U(Size user_block_size) : match_CCORR_32F_(user_block_size)
{
}
void match(InputArray image, InputArray templ, OutputArray result, Stream& stream = Stream::Null());
private:
GpuMat imagef_, templf_;
Match_CCORR_32F match_CCORR_32F_;
GpuMat intBuffer_;
std::vector<GpuMat> images_;
std::vector<GpuMat> image_sums_;
std::vector<GpuMat> image_sqsums_;
};
void Match_CCOEFF_NORMED_8U::match(InputArray _image, InputArray _templ, OutputArray _result, Stream& stream)
{
using namespace cv::cuda::device::match_template;
GpuMat image = _image.getGpuMat();
GpuMat templ = _templ.getGpuMat();
CV_Assert( image.depth() == CV_8U );
CV_Assert( image.type() == templ.type() );
CV_Assert( image.cols >= templ.cols && image.rows >= templ.rows );
image.convertTo(imagef_, CV_32F, stream);
templ.convertTo(templf_, CV_32F, stream);
match_CCORR_32F_.match(imagef_, templf_, _result, stream);
GpuMat result = _result.getGpuMat();
if (image.channels() == 1)
{
image_sums_.resize(1);
cuda::integral(image, image_sums_[0], intBuffer_, stream);
image_sqsums_.resize(1);
cuda::sqrIntegral(image, image_sqsums_[0], intBuffer_, stream);
unsigned int templ_sum = (unsigned int) cuda::sum(templ)[0];
unsigned long long templ_sqsum = (unsigned long long) cuda::sqrSum(templ)[0];
matchTemplatePrepared_CCOFF_NORMED_8U(
templ.cols, templ.rows, image_sums_[0], image_sqsums_[0],
templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream));
}
else
{
cuda::split(image, images_);
image_sums_.resize(images_.size());
image_sqsums_.resize(images_.size());
for (int i = 0; i < image.channels(); ++i)
{
cuda::integral(images_[i], image_sums_[i], intBuffer_, stream);
cuda::sqrIntegral(images_[i], image_sqsums_[i], intBuffer_, stream);
}
Scalar templ_sum = cuda::sum(templ);
Scalar templ_sqsum = cuda::sqrSum(templ);
switch (image.channels())
{
case 2:
matchTemplatePrepared_CCOFF_NORMED_8UC2(
templ.cols, templ.rows,
image_sums_[0], image_sqsums_[0],
image_sums_[1], image_sqsums_[1],
(unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
result, StreamAccessor::getStream(stream));
break;
case 3:
matchTemplatePrepared_CCOFF_NORMED_8UC3(
templ.cols, templ.rows,
image_sums_[0], image_sqsums_[0],
image_sums_[1], image_sqsums_[1],
image_sums_[2], image_sqsums_[2],
(unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
(unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
result, StreamAccessor::getStream(stream));
break;
case 4:
matchTemplatePrepared_CCOFF_NORMED_8UC4(
templ.cols, templ.rows,
image_sums_[0], image_sqsums_[0],
image_sums_[1], image_sqsums_[1],
image_sums_[2], image_sqsums_[2],
image_sums_[3], image_sqsums_[3],
(unsigned int)templ_sum[0], (unsigned long long)templ_sqsum[0],
(unsigned int)templ_sum[1], (unsigned long long)templ_sqsum[1],
(unsigned int)templ_sum[2], (unsigned long long)templ_sqsum[2],
(unsigned int)templ_sum[3], (unsigned long long)templ_sqsum[3],
result, StreamAccessor::getStream(stream));
break;
default:
CV_Error(Error::StsBadArg, "unsupported number of channels");
}
}
}
}
Ptr<cuda::TemplateMatching> cv::cuda::createTemplateMatching(int srcType, int method, Size user_block_size)
{
const int sdepth = CV_MAT_DEPTH(srcType);
CV_Assert( sdepth == CV_8U || sdepth == CV_32F );
if (sdepth == CV_32F)
{
switch (method)
{
case TM_SQDIFF:
return new Match_SQDIFF_32F;
case TM_CCORR:
return new Match_CCORR_32F(user_block_size);
default:
CV_Error( Error::StsBadFlag, "Unsupported method" );
return Ptr<cuda::TemplateMatching>();
}
}
else
{
switch (method)
{
case TM_SQDIFF:
return new Match_SQDIFF_8U(user_block_size);
case TM_SQDIFF_NORMED:
return new Match_SQDIFF_NORMED_8U(user_block_size);
case TM_CCORR:
return new Match_CCORR_8U(user_block_size);
case TM_CCORR_NORMED:
return new Match_CCORR_NORMED_8U(user_block_size);
case TM_CCOEFF:
return new Match_CCOEFF_8U(user_block_size);
case TM_CCOEFF_NORMED:
return new Match_CCOEFF_NORMED_8U(user_block_size);
default:
CV_Error( Error::StsBadFlag, "Unsupported method" );
return Ptr<cuda::TemplateMatching>();
}
}
}
#endif
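For orientation, a minimal usage sketch of the factory defined above — assuming a CUDA-enabled build with cudaarithm available; the synthetic input and variable names are illustrative only and are not part of this commit:
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/cudaimgproc.hpp>
int main()
{
    // Synthetic host data: a dark scene containing a bright patch to search for.
    cv::Mat h_image(480, 640, CV_8UC1, cv::Scalar(0));
    cv::rectangle(h_image, cv::Rect(100, 80, 32, 32), cv::Scalar(255), -1);
    cv::Mat h_templ = h_image(cv::Rect(100, 80, 32, 32)).clone();
    // Upload to the device and run normalized cross-correlation matching.
    cv::cuda::GpuMat d_image(h_image), d_templ(h_templ), d_result;
    cv::Ptr<cv::cuda::TemplateMatching> alg =
        cv::cuda::createTemplateMatching(CV_8U, cv::TM_CCORR_NORMED);
    alg->match(d_image, d_templ, d_result);
    // Download the response map and locate the best match on the host.
    cv::Mat h_result(d_result);
    double maxVal; cv::Point maxLoc;
    cv::minMaxLoc(h_result, 0, &maxVal, 0, &maxLoc);
    return 0;
}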

View File

@@ -0,0 +1,128 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::cuda;
#if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
void cv::cuda::meanShiftFiltering(InputArray, OutputArray, int, int, TermCriteria, Stream&) { throw_no_cuda(); }
void cv::cuda::meanShiftProc(InputArray, OutputArray, OutputArray, int, int, TermCriteria, Stream&) { throw_no_cuda(); }
#else /* !defined (HAVE_CUDA) */
////////////////////////////////////////////////////////////////////////
// meanShiftFiltering
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
}
}}}
void cv::cuda::meanShiftFiltering(InputArray _src, OutputArray _dst, int sp, int sr, TermCriteria criteria, Stream& stream)
{
using namespace ::cv::cuda::device::imgproc;
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC4 );
_dst.create(src.size(), CV_8UC4);
GpuMat dst = _dst.getGpuMat();
if (!(criteria.type & TermCriteria::MAX_ITER))
criteria.maxCount = 5;
int maxIter = std::min(std::max(criteria.maxCount, 1), 100);
if (!(criteria.type & TermCriteria::EPS))
criteria.epsilon = 1.f;
float eps = (float) std::max(criteria.epsilon, 0.0);
meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));
}
////////////////////////////////////////////////////////////////////////
// meanShiftProc
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
void meanShiftProc_gpu(const PtrStepSzb& src, PtrStepSzb dstr, PtrStepSzb dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
}
}}}
void cv::cuda::meanShiftProc(InputArray _src, OutputArray _dstr, OutputArray _dstsp, int sp, int sr, TermCriteria criteria, Stream& stream)
{
using namespace ::cv::cuda::device::imgproc;
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC4 );
_dstr.create(src.size(), CV_8UC4);
_dstsp.create(src.size(), CV_16SC2);
GpuMat dstr = _dstr.getGpuMat();
GpuMat dstsp = _dstsp.getGpuMat();
if (!(criteria.type & TermCriteria::MAX_ITER))
criteria.maxCount = 5;
int maxIter = std::min(std::max(criteria.maxCount, 1), 100);
if (!(criteria.type & TermCriteria::EPS))
criteria.epsilon = 1.f;
float eps = (float) std::max(criteria.epsilon, 0.0);
meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));
}
#endif /* !defined (HAVE_CUDA) */
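A minimal usage sketch for the two routines above — assuming a CUDA-enabled build; the synthetic input and variable names are illustrative only and are not part of this commit:
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/cudaimgproc.hpp>
int main()
{
    // Both functions require a CV_8UC4 input, so convert from BGR first.
    cv::Mat h_bgr(480, 640, CV_8UC3, cv::Scalar(40, 80, 120));
    cv::Mat h_bgra;
    cv::cvtColor(h_bgr, h_bgra, cv::COLOR_BGR2BGRA);
    cv::cuda::GpuMat d_src(h_bgra), d_filtered;
    cv::TermCriteria crit(cv::TermCriteria::MAX_ITER + cv::TermCriteria::EPS, 5, 1);
    cv::cuda::meanShiftFiltering(d_src, d_filtered, 10 /*sp*/, 10 /*sr*/, crit);
    // meanShiftProc additionally returns the CV_16SC2 map of converged positions.
    cv::cuda::GpuMat d_colors, d_positions;
    cv::cuda::meanShiftProc(d_src, d_colors, d_positions, 10, 10, crit);
    cv::Mat h_filtered(d_filtered);   // download the filtered image
    return 0;
}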

View File

@@ -0,0 +1,391 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#if !defined HAVE_CUDA || defined(CUDA_DISABLER)
void cv::cuda::meanShiftSegmentation(InputArray, OutputArray, int, int, int, TermCriteria) { throw_no_cuda(); }
#else
// Auxiliary stuff
namespace
{
//
// Declarations
//
class DjSets
{
public:
DjSets(int n);
int find(int elem);
int merge(int set1, int set2);
std::vector<int> parent;
std::vector<int> rank;
std::vector<int> size;
private:
DjSets(const DjSets&);
void operator =(const DjSets&);
};
template <typename T>
struct GraphEdge
{
GraphEdge() {}
GraphEdge(int to_, int next_, const T& val_) : to(to_), next(next_), val(val_) {}
int to;
int next;
T val;
};
template <typename T>
class Graph
{
public:
typedef GraphEdge<T> Edge;
Graph(int numv, int nume_max);
void addEdge(int from, int to, const T& val=T());
std::vector<int> start;
std::vector<Edge> edges;
int numv;
int nume_max;
int nume;
private:
Graph(const Graph&);
void operator =(const Graph&);
};
struct SegmLinkVal
{
SegmLinkVal() {}
SegmLinkVal(int dr_, int dsp_) : dr(dr_), dsp(dsp_) {}
bool operator <(const SegmLinkVal& other) const
{
return dr + dsp < other.dr + other.dsp;
}
int dr;
int dsp;
};
struct SegmLink
{
SegmLink() {}
SegmLink(int from_, int to_, const SegmLinkVal& val_)
: from(from_), to(to_), val(val_) {}
bool operator <(const SegmLink& other) const
{
return val < other.val;
}
int from;
int to;
SegmLinkVal val;
};
//
// Implementation
//
DjSets::DjSets(int n) : parent(n), rank(n, 0), size(n, 1)
{
for (int i = 0; i < n; ++i)
parent[i] = i;
}
inline int DjSets::find(int elem)
{
int set = elem;
while (set != parent[set])
set = parent[set];
while (elem != parent[elem])
{
int next = parent[elem];
parent[elem] = set;
elem = next;
}
return set;
}
inline int DjSets::merge(int set1, int set2)
{
if (rank[set1] < rank[set2])
{
parent[set1] = set2;
size[set2] += size[set1];
return set2;
}
if (rank[set2] < rank[set1])
{
parent[set2] = set1;
size[set1] += size[set2];
return set1;
}
parent[set1] = set2;
rank[set2]++;
size[set2] += size[set1];
return set2;
}
template <typename T>
Graph<T>::Graph(int numv_, int nume_max_) : start(numv_, -1), edges(nume_max_)
{
this->numv = numv_;
this->nume_max = nume_max_;
nume = 0;
}
template <typename T>
inline void Graph<T>::addEdge(int from, int to, const T& val)
{
edges[nume] = Edge(to, start[from], val);
start[from] = nume;
nume++;
}
inline int pix(int y, int x, int ncols)
{
return y * ncols + x;
}
inline int sqr(int x)
{
return x * x;
}
inline int dist2(const cv::Vec4b& lhs, const cv::Vec4b& rhs)
{
return sqr(lhs[0] - rhs[0]) + sqr(lhs[1] - rhs[1]) + sqr(lhs[2] - rhs[2]);
}
inline int dist2(const cv::Vec2s& lhs, const cv::Vec2s& rhs)
{
return sqr(lhs[0] - rhs[0]) + sqr(lhs[1] - rhs[1]);
}
} // anonymous namespace
void cv::cuda::meanShiftSegmentation(InputArray _src, OutputArray _dst, int sp, int sr, int minsize, TermCriteria criteria)
{
GpuMat src = _src.getGpuMat();
CV_Assert( src.type() == CV_8UC4 );
const int nrows = src.rows;
const int ncols = src.cols;
const int hr = sr;
const int hsp = sp;
// Perform mean shift procedure and obtain region and spatial maps
GpuMat d_rmap, d_spmap;
cuda::meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria);
Mat rmap(d_rmap);
Mat spmap(d_spmap);
Graph<SegmLinkVal> g(nrows * ncols, 4 * (nrows - 1) * (ncols - 1)
+ (nrows - 1) + (ncols - 1));
// Build the region adjacency graph from the image
Vec4b r1;
Vec4b r2[4];
Vec2s sp1;
Vec2s sp2[4];
int dr[4];
int dsp[4];
for (int y = 0; y < nrows - 1; ++y)
{
Vec4b* ry = rmap.ptr<Vec4b>(y);
Vec4b* ryp = rmap.ptr<Vec4b>(y + 1);
Vec2s* spy = spmap.ptr<Vec2s>(y);
Vec2s* spyp = spmap.ptr<Vec2s>(y + 1);
for (int x = 0; x < ncols - 1; ++x)
{
r1 = ry[x];
sp1 = spy[x];
r2[0] = ry[x + 1];
r2[1] = ryp[x];
r2[2] = ryp[x + 1];
r2[3] = ryp[x];
sp2[0] = spy[x + 1];
sp2[1] = spyp[x];
sp2[2] = spyp[x + 1];
sp2[3] = spyp[x];
dr[0] = dist2(r1, r2[0]);
dr[1] = dist2(r1, r2[1]);
dr[2] = dist2(r1, r2[2]);
dsp[0] = dist2(sp1, sp2[0]);
dsp[1] = dist2(sp1, sp2[1]);
dsp[2] = dist2(sp1, sp2[2]);
r1 = ry[x + 1];
sp1 = spy[x + 1];
dr[3] = dist2(r1, r2[3]);
dsp[3] = dist2(sp1, sp2[3]);
g.addEdge(pix(y, x, ncols), pix(y, x + 1, ncols), SegmLinkVal(dr[0], dsp[0]));
g.addEdge(pix(y, x, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[1], dsp[1]));
g.addEdge(pix(y, x, ncols), pix(y + 1, x + 1, ncols), SegmLinkVal(dr[2], dsp[2]));
g.addEdge(pix(y, x + 1, ncols), pix(y + 1, x, ncols), SegmLinkVal(dr[3], dsp[3]));
}
}
for (int y = 0; y < nrows - 1; ++y)
{
r1 = rmap.at<Vec4b>(y, ncols - 1);
r2[0] = rmap.at<Vec4b>(y + 1, ncols - 1);
sp1 = spmap.at<Vec2s>(y, ncols - 1);
sp2[0] = spmap.at<Vec2s>(y + 1, ncols - 1);
dr[0] = dist2(r1, r2[0]);
dsp[0] = dist2(sp1, sp2[0]);
g.addEdge(pix(y, ncols - 1, ncols), pix(y + 1, ncols - 1, ncols), SegmLinkVal(dr[0], dsp[0]));
}
for (int x = 0; x < ncols - 1; ++x)
{
r1 = rmap.at<Vec4b>(nrows - 1, x);
r2[0] = rmap.at<Vec4b>(nrows - 1, x + 1);
sp1 = spmap.at<Vec2s>(nrows - 1, x);
sp2[0] = spmap.at<Vec2s>(nrows - 1, x + 1);
dr[0] = dist2(r1, r2[0]);
dsp[0] = dist2(sp1, sp2[0]);
g.addEdge(pix(nrows - 1, x, ncols), pix(nrows - 1, x + 1, ncols), SegmLinkVal(dr[0], dsp[0]));
}
DjSets comps(g.numv);
// Merge adjacent components whose color and spatial distances are below the thresholds
for (int v = 0; v < g.numv; ++v)
{
for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next)
{
int c1 = comps.find(v);
int c2 = comps.find(g.edges[e_it].to);
if (c1 != c2 && g.edges[e_it].val.dr < hr && g.edges[e_it].val.dsp < hsp)
comps.merge(c1, c2);
}
}
std::vector<SegmLink> edges;
edges.reserve(g.numv);
// Collect edges that connect different components
for (int v = 0; v < g.numv; ++v)
{
int c1 = comps.find(v);
for (int e_it = g.start[v]; e_it != -1; e_it = g.edges[e_it].next)
{
int c2 = comps.find(g.edges[e_it].to);
if (c1 != c2)
edges.push_back(SegmLink(c1, c2, g.edges[e_it].val));
}
}
// Sort all edges connecting different components (in ascending order of weight)
std::sort(edges.begin(), edges.end());
// Merge away components smaller than minsize (starting from the closest pair)
for (size_t i = 0; i < edges.size(); ++i)
{
int c1 = comps.find(edges[i].from);
int c2 = comps.find(edges[i].to);
if (c1 != c2 && (comps.size[c1] < minsize || comps.size[c2] < minsize))
comps.merge(c1, c2);
}
// Accumulate, for each segment, the sum of its pixels' colors
Mat h_src(src);
std::vector<Vec4i> sumcols(nrows * ncols, Vec4i(0, 0, 0, 0));
for (int y = 0; y < nrows; ++y)
{
Vec4b* h_srcy = h_src.ptr<Vec4b>(y);
for (int x = 0; x < ncols; ++x)
{
int parent = comps.find(pix(y, x, ncols));
Vec4b col = h_srcy[x];
Vec4i& sumcol = sumcols[parent];
sumcol[0] += col[0];
sumcol[1] += col[1];
sumcol[2] += col[2];
}
}
// Create the final image; each segment gets the average color of its pixels
_dst.create(src.size(), src.type());
Mat dst = _dst.getMat();
for (int y = 0; y < nrows; ++y)
{
Vec4b* dsty = dst.ptr<Vec4b>(y);
for (int x = 0; x < ncols; ++x)
{
int parent = comps.find(pix(y, x, ncols));
const Vec4i& sumcol = sumcols[parent];
Vec4b& dstcol = dsty[x];
dstcol[0] = static_cast<uchar>(sumcol[0] / comps.size[parent]);
dstcol[1] = static_cast<uchar>(sumcol[1] / comps.size[parent]);
dstcol[2] = static_cast<uchar>(sumcol[2] / comps.size[parent]);
dstcol[3] = 255;
}
}
}
#endif // #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
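A minimal usage sketch for meanShiftSegmentation as implemented above — assuming a CUDA-enabled build; the synthetic input and variable names are illustrative only and are not part of this commit:
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/cudaimgproc.hpp>
int main()
{
    // The function expects a CV_8UC4 device image and writes its result to a host array.
    cv::Mat h_bgr(480, 640, CV_8UC3, cv::Scalar(40, 80, 120));
    cv::Mat h_bgra;
    cv::cvtColor(h_bgr, h_bgra, cv::COLOR_BGR2BGRA);
    cv::cuda::GpuMat d_src(h_bgra);
    cv::Mat h_segmented;              // output is created on the host (see getMat above)
    cv::cuda::meanShiftSegmentation(d_src, h_segmented,
                                    10 /*sp*/, 10 /*sr*/, 20 /*minsize*/,
                                    cv::TermCriteria(cv::TermCriteria::MAX_ITER + cv::TermCriteria::EPS, 5, 1));
    return 0;
}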

View File

@@ -0,0 +1,43 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"

View File

@@ -0,0 +1,62 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_PRECOMP_H__
#define __OPENCV_PRECOMP_H__
#include "opencv2/cudaimgproc.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/private.hpp"
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_CUDAARITHM
# include "opencv2/cudaarithm.hpp"
#endif
#ifdef HAVE_OPENCV_CUDAFILTERS
# include "opencv2/cudafilters.hpp"
#endif
#endif /* __OPENCV_PRECOMP_H__ */