moved GpuMat and DevMem2D to core module, some code refactoring

parent 8a148e39f0
commit fcfa72081e
@@ -90,6 +90,10 @@ class Mat;
class SparseMat;
typedef Mat MatND;

namespace gpu {
    class GpuMat;
}

class CV_EXPORTS MatExpr;
class CV_EXPORTS MatOp_Base;
class CV_EXPORTS MatArg;
@@ -1627,6 +1631,10 @@ public:
    template<typename _Tp> explicit Mat(const Point3_<_Tp>& pt, bool copyData=true);
    //! builds matrix from comma initializer
    template<typename _Tp> explicit Mat(const MatCommaInitializer_<_Tp>& commaInitializer);

    //! download data from GpuMat
    explicit Mat(const gpu::GpuMat& m);

    //! destructor - calls release()
    ~Mat();
    //! assignment operators
157  modules/core/include/opencv2/core/devmem2d.hpp  Normal file
@@ -0,0 +1,157 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_CORE_DevMem2D_HPP__
#define __OPENCV_CORE_DevMem2D_HPP__

#ifdef __CUDACC__
    #define __CV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
#else
    #define __CV_GPU_HOST_DEVICE__
#endif

namespace cv
{
    namespace gpu
    {
        // Simple lightweight structures that encapsulate information about an image on the device.
        // They are intended to be passed to nvcc-compiled code, since GpuMat itself depends on headers that nvcc can't compile.

        template <bool expr> struct StaticAssert;
        template <> struct StaticAssert<true> { static __CV_GPU_HOST_DEVICE__ void check() {} };

        template<typename T> struct DevPtr
        {
            typedef T elem_type;
            typedef int index_type;

            enum { elem_size = sizeof(elem_type) };

            T* data;

            __CV_GPU_HOST_DEVICE__ DevPtr() : data(0) {}
            __CV_GPU_HOST_DEVICE__ DevPtr(T* data_) : data(data_) {}

            __CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
            __CV_GPU_HOST_DEVICE__ operator T*() { return data; }
            __CV_GPU_HOST_DEVICE__ operator const T*() const { return data; }
        };

        template<typename T> struct PtrSz : public DevPtr<T>
        {
            __CV_GPU_HOST_DEVICE__ PtrSz() : size(0) {}
            __CV_GPU_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr<T>(data_), size(size_) {}

            size_t size;
        };

        template<typename T> struct PtrStep : public DevPtr<T>
        {
            __CV_GPU_HOST_DEVICE__ PtrStep() : step(0) {}
            __CV_GPU_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}

            /** \brief stride between two consecutive rows. The step is always stored in bytes! */
            size_t step;

            __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return (T*)((char*)DevPtr<T>::data + y * step); }
            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)((const char*)DevPtr<T>::data + y * step); }

            __CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
            __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
        };

        template <typename T> struct PtrStepSz : public PtrStep<T>
        {
            __CV_GPU_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}
            __CV_GPU_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_)
                : PtrStep<T>(data_, step_), cols(cols_), rows(rows_) {}

            int cols;
            int rows;
        };

        template <typename T> struct DevMem2D_ : public PtrStepSz<T>
        {
            DevMem2D_() {}
            DevMem2D_(int rows_, int cols_, T* data_, size_t step_) : PtrStepSz<T>(rows_, cols_, data_, step_) {}

            template <typename U>
            explicit DevMem2D_(const DevMem2D_<U>& d) : PtrStepSz<T>(d.rows, d.cols, (T*)d.data, d.step) {}
        };

        template<typename T> struct PtrElemStep_ : public PtrStep<T>
        {
            PtrElemStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step)
            {
                StaticAssert<256 % sizeof(T) == 0>::check();

                PtrStep<T>::step /= PtrStep<T>::elem_size;
            }

            __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return PtrStep<T>::data + y * PtrStep<T>::step; }
            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep<T>::data + y * PtrStep<T>::step; }

            __CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
            __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
        };

        template<typename T> struct PtrStep_ : public PtrStep<T>
        {
            PtrStep_() {}
            PtrStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step) {}
        };

        typedef DevMem2D_<unsigned char> DevMem2Db;
        typedef DevMem2Db DevMem2D;
        typedef DevMem2D_<float> DevMem2Df;
        typedef DevMem2D_<int> DevMem2Di;

        typedef PtrStep<unsigned char> PtrStepb;
        typedef PtrStep<float> PtrStepf;
        typedef PtrStep<int> PtrStepi;

        typedef PtrElemStep_<unsigned char> PtrElemStep;
        typedef PtrElemStep_<float> PtrElemStepf;
        typedef PtrElemStep_<int> PtrElemStepi;
    }
}

#endif /* __OPENCV_CORE_DevMem2D_HPP__ */
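For orientation (this sketch is not part of the commit): the structures above are the only view of an image that nvcc-compiled code ever sees. A minimal example of a device-side consumer, assuming a single-channel float image; the kernel name and launch configuration are illustrative assumptions:

    // scaleKernel is hypothetical. PtrStepSz carries only {data, step, rows, cols},
    // so this .cu translation unit never needs the full GpuMat definition.
    #include "opencv2/core/devmem2d.hpp"

    __global__ void scaleKernel(cv::gpu::PtrStepSz<float> img, float factor)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < img.cols && y < img.rows)
            img(y, x) *= factor;   // operator() applies the byte step via ptr(y)
    }

Because step is stored in bytes, operator() goes through ptr(y), which offsets the raw char pointer before reinterpreting it as T*; PtrElemStep_ is the variant whose step is pre-divided into elements.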
471  modules/core/include/opencv2/core/gpumat.hpp  Normal file
@@ -0,0 +1,471 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_GPUMAT_HPP__
#define __OPENCV_GPUMAT_HPP__

#include "opencv2/core/core.hpp"
#include "opencv2/core/devmem2d.hpp"

namespace cv { namespace gpu
{
    //! Smart pointer for GPU memory with reference counting. Its interface is mostly similar to cv::Mat.
    class CV_EXPORTS GpuMat
    {
    public:
        //! default constructor
        GpuMat();

        //! constructs a GpuMat of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
        GpuMat(int rows, int cols, int type);
        GpuMat(Size size, int type);

        //! constructs a GpuMat and fills it with the specified value _s
        GpuMat(int rows, int cols, int type, Scalar s);
        GpuMat(Size size, int type, Scalar s);

        //! copy constructor
        GpuMat(const GpuMat& m);

        //! constructor for GpuMat headers pointing to user-allocated data
        GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);
        GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);

        //! creates a matrix header for a part of the bigger matrix
        GpuMat(const GpuMat& m, Range rowRange, Range colRange);
        GpuMat(const GpuMat& m, Rect roi);

        //! builds a GpuMat from a Mat. Performs a blocking upload to the device.
        explicit GpuMat(const Mat& m);

        //! destructor - calls release()
        ~GpuMat();

        //! assignment operators
        GpuMat& operator = (const GpuMat& m);

        //! performs a blocking upload of data to the GpuMat
        void upload(const Mat& m);

        //! downloads data from device to host memory. Blocking call.
        void download(Mat& m) const;

        //! returns a new GpuMat header for the specified row
        GpuMat row(int y) const;
        //! returns a new GpuMat header for the specified column
        GpuMat col(int x) const;
        //! ... for the specified row span
        GpuMat rowRange(int startrow, int endrow) const;
        GpuMat rowRange(Range r) const;
        //! ... for the specified column span
        GpuMat colRange(int startcol, int endcol) const;
        GpuMat colRange(Range r) const;

        //! returns a deep copy of the GpuMat, i.e. the data is copied
        GpuMat clone() const;
        //! copies the GpuMat content to "m".
        // It calls m.create(this->size(), this->type()).
        void copyTo(GpuMat& m) const;
        //! copies those GpuMat elements to "m" that are marked with non-zero mask elements
        void copyTo(GpuMat& m, const GpuMat& mask) const;
        //! converts the GpuMat to another datatype with optional scaling. See cvConvertScale.
        void convertTo(GpuMat& m, int rtype, double alpha = 1, double beta = 0) const;

        void assignTo(GpuMat& m, int type = -1) const;

        //! sets every GpuMat element to s
        GpuMat& operator = (Scalar s);
        //! sets some of the GpuMat elements to s, according to the mask
        GpuMat& setTo(Scalar s, const GpuMat& mask = GpuMat());
        //! creates an alternative GpuMat header for the same data, with a different
        // number of channels and/or a different number of rows. See cvReshape.
        GpuMat reshape(int cn, int rows = 0) const;

        //! allocates new GpuMat data unless the GpuMat already has the specified size and type.
        // previous data is unreferenced if needed.
        void create(int rows, int cols, int type);
        void create(Size size, int type);
        //! decreases the reference counter;
        // deallocates the data when the reference counter reaches 0.
        void release();

        //! swaps with another smart pointer
        void swap(GpuMat& mat);

        //! locates the GpuMat header within a parent GpuMat. See below.
        void locateROI(Size& wholeSize, Point& ofs) const;
        //! moves/resizes the current GpuMat ROI inside the parent GpuMat
        GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);
        //! extracts a rectangular sub-GpuMat
        // (this is a generalized form of row, rowRange etc.)
        GpuMat operator()(Range rowRange, Range colRange) const;
        GpuMat operator()(Rect roi) const;

        //! returns true iff the GpuMat data is continuous
        // (i.e. when there are no gaps between successive rows),
        // similar to CV_IS_MAT_CONT(cvMat->type)
        bool isContinuous() const;
        //! returns element size in bytes,
        // similar to CV_ELEM_SIZE(cvMat->type)
        size_t elemSize() const;
        //! returns the size of an element channel in bytes
        size_t elemSize1() const;
        //! returns element type, similar to CV_MAT_TYPE(cvMat->type)
        int type() const;
        //! returns element depth, similar to CV_MAT_DEPTH(cvMat->type)
        int depth() const;
        //! returns the number of channels, similar to CV_MAT_CN(cvMat->type)
        int channels() const;
        //! returns step/elemSize1()
        size_t step1() const;
        //! returns GpuMat size:
        // width == number of columns, height == number of rows
        Size size() const;
        //! returns true if GpuMat data is NULL
        bool empty() const;

        //! returns pointer to the y-th row
        uchar* ptr(int y = 0);
        const uchar* ptr(int y = 0) const;

        //! template version of the above method
        template<typename _Tp> _Tp* ptr(int y = 0);
        template<typename _Tp> const _Tp* ptr(int y = 0) const;

        template <typename _Tp> operator DevMem2D_<_Tp>() const;
        template <typename _Tp> operator PtrStep_<_Tp>() const;

        /*! includes several bit-fields:
             - the magic signature
             - continuity flag
             - depth
             - number of channels
        */
        int flags;

        //! the number of rows and columns
        int rows, cols;

        //! a distance between successive rows in bytes; includes the gap if any
        size_t step;

        //! pointer to the data
        uchar* data;

        //! pointer to the reference counter;
        // when the GpuMat points to user-allocated data, the pointer is NULL
        int* refcount;

        //! helper fields used in locateROI and adjustROI
        uchar* datastart;
        uchar* dataend;
    };

    //! Creates a continuous GPU matrix
    CV_EXPORTS void createContinuous(int rows, int cols, int type, GpuMat& m);
    CV_EXPORTS GpuMat createContinuous(int rows, int cols, int type);
    CV_EXPORTS void createContinuous(Size size, int type, GpuMat& m);
    CV_EXPORTS GpuMat createContinuous(Size size, int type);

    //! Ensures that the size of the given matrix is not less than (rows, cols)
    //! and that its type matches the specified one
    CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m);
    CV_EXPORTS void ensureSizeIsEnough(Size size, int type, GpuMat& m);

    class CV_EXPORTS GpuFuncTable
    {
    public:
        virtual ~GpuFuncTable() {}

        virtual void copy(const Mat& src, GpuMat& dst) const = 0;
        virtual void copy(const GpuMat& src, Mat& dst) const = 0;
        virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;

        virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;

        virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
        virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0;

        virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0;

        virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
        virtual void free(void* devPtr) const = 0;
    };

    CV_EXPORTS void setGpuFuncTable(const GpuFuncTable* funcTbl);

    ////////////////////////////////////////////////////////////////////////

    inline GpuMat::GpuMat()
        : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
    {
    }

    inline GpuMat::GpuMat(int rows_, int cols_, int type_)
        : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
    {
        if (rows_ > 0 && cols_ > 0)
            create(rows_, cols_, type_);
    }

    inline GpuMat::GpuMat(Size size_, int type_)
        : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
    {
        if (size_.height > 0 && size_.width > 0)
            create(size_.height, size_.width, type_);
    }

    inline GpuMat::GpuMat(int rows_, int cols_, int type_, Scalar s_)
        : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
    {
        if (rows_ > 0 && cols_ > 0)
        {
            create(rows_, cols_, type_);
            setTo(s_);
        }
    }

    inline GpuMat::GpuMat(Size size_, int type_, Scalar s_)
        : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
    {
        if (size_.height > 0 && size_.width > 0)
        {
            create(size_.height, size_.width, type_);
            setTo(s_);
        }
    }

    inline GpuMat::~GpuMat()
    {
        release();
    }

    inline GpuMat GpuMat::clone() const
    {
        GpuMat m;
        copyTo(m);
        return m;
    }

    inline void GpuMat::assignTo(GpuMat& m, int type) const
    {
        if (type < 0)
            m = *this;
        else
            convertTo(m, type);
    }

    inline size_t GpuMat::step1() const
    {
        return step / elemSize1();
    }

    inline bool GpuMat::empty() const
    {
        return data == 0;
    }

    template<typename _Tp> inline _Tp* GpuMat::ptr(int y)
    {
        return (_Tp*)ptr(y);
    }

    template<typename _Tp> inline const _Tp* GpuMat::ptr(int y) const
    {
        return (const _Tp*)ptr(y);
    }

    inline void swap(GpuMat& a, GpuMat& b)
    {
        a.swap(b);
    }

    inline GpuMat GpuMat::row(int y) const
    {
        return GpuMat(*this, Range(y, y+1), Range::all());
    }

    inline GpuMat GpuMat::col(int x) const
    {
        return GpuMat(*this, Range::all(), Range(x, x+1));
    }

    inline GpuMat GpuMat::rowRange(int startrow, int endrow) const
    {
        return GpuMat(*this, Range(startrow, endrow), Range::all());
    }

    inline GpuMat GpuMat::rowRange(Range r) const
    {
        return GpuMat(*this, r, Range::all());
    }

    inline GpuMat GpuMat::colRange(int startcol, int endcol) const
    {
        return GpuMat(*this, Range::all(), Range(startcol, endcol));
    }

    inline GpuMat GpuMat::colRange(Range r) const
    {
        return GpuMat(*this, Range::all(), r);
    }

    inline void GpuMat::create(Size size_, int type_)
    {
        create(size_.height, size_.width, type_);
    }

    inline GpuMat GpuMat::operator()(Range rowRange, Range colRange) const
    {
        return GpuMat(*this, rowRange, colRange);
    }

    inline GpuMat GpuMat::operator()(Rect roi) const
    {
        return GpuMat(*this, roi);
    }

    inline bool GpuMat::isContinuous() const
    {
        return (flags & Mat::CONTINUOUS_FLAG) != 0;
    }

    inline size_t GpuMat::elemSize() const
    {
        return CV_ELEM_SIZE(flags);
    }

    inline size_t GpuMat::elemSize1() const
    {
        return CV_ELEM_SIZE1(flags);
    }

    inline int GpuMat::type() const
    {
        return CV_MAT_TYPE(flags);
    }

    inline int GpuMat::depth() const
    {
        return CV_MAT_DEPTH(flags);
    }

    inline int GpuMat::channels() const
    {
        return CV_MAT_CN(flags);
    }

    inline Size GpuMat::size() const
    {
        return Size(cols, rows);
    }

    inline uchar* GpuMat::ptr(int y)
    {
        CV_DbgAssert((unsigned)y < (unsigned)rows);
        return data + step * y;
    }

    inline const uchar* GpuMat::ptr(int y) const
    {
        CV_DbgAssert((unsigned)y < (unsigned)rows);
        return data + step * y;
    }

    inline GpuMat& GpuMat::operator = (Scalar s)
    {
        setTo(s);
        return *this;
    }

    template <class T> inline GpuMat::operator DevMem2D_<T>() const
    {
        return DevMem2D_<T>(rows, cols, (T*)data, step);
    }

    template <class T> inline GpuMat::operator PtrStep_<T>() const
    {
        return PtrStep_<T>(static_cast< DevMem2D_<T> >(*this));
    }

    inline GpuMat createContinuous(int rows, int cols, int type)
    {
        GpuMat m;
        createContinuous(rows, cols, type, m);
        return m;
    }

    inline void createContinuous(Size size, int type, GpuMat& m)
    {
        createContinuous(size.height, size.width, type, m);
    }

    inline GpuMat createContinuous(Size size, int type)
    {
        GpuMat m;
        createContinuous(size, type, m);
        return m;
    }

    inline void ensureSizeIsEnough(Size size, int type, GpuMat& m)
    {
        ensureSizeIsEnough(size.height, size.width, type, m);
    }

    inline void createContinuous(int rows, int cols, int type, GpuMat& m)
    {
        int area = rows * cols;
        if (!m.isContinuous() || m.type() != type || m.size().area() != area)
            m.create(1, area, type);
        m = m.reshape(0, rows);
    }

    inline void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m)
    {
        if (m.type() == type && m.rows >= rows && m.cols >= cols)
            m = m(Rect(0, 0, cols, rows));
        else
            m.create(rows, cols, type);
    }
}}

#endif // __OPENCV_GPUMAT_HPP__
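As a hedged usage sketch (not part of the diff): once some provider has registered a working GpuFuncTable, the relocated class keeps the familiar Mat-like workflow. roundTrip and its 16x16 ROI are illustrative assumptions:

    #include "opencv2/core/gpumat.hpp"

    // Assumes host is at least 16x16 and a GpuFuncTable has been registered.
    void roundTrip(const cv::Mat& host)
    {
        cv::gpu::GpuMat d_img(host);                            // blocking upload to the device

        cv::gpu::GpuMat d_roi = d_img(cv::Rect(0, 0, 16, 16));  // new header, no data copy
        d_roi.setTo(cv::Scalar::all(0));                        // dispatched through the GpuFuncTable

        cv::Mat back;
        d_img.download(back);                                   // blocking download to the host
    }

Without a registered table, each of these calls falls through to the EmptyFuncTable defined in gpumat.cpp below and throws "The library is compiled without GPU support".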
460  modules/core/src/gpumat.cpp  Normal file
@@ -0,0 +1,460 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "opencv2/core/gpumat.hpp"

using namespace std;
using namespace cv;
using namespace cv::gpu;

cv::gpu::GpuMat::GpuMat(const GpuMat& m)
    : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend)
{
    if (refcount)
        CV_XADD(refcount, 1);
}

cv::gpu::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) :
    flags(Mat::MAGIC_VAL + (type_ & TYPE_MASK)), rows(rows_), cols(cols_),
    step(step_), data((uchar*)data_), refcount(0),
    datastart((uchar*)data_), dataend((uchar*)data_)
{
    size_t minstep = cols * elemSize();

    if (step == Mat::AUTO_STEP)
    {
        step = minstep;
        flags |= Mat::CONTINUOUS_FLAG;
    }
    else
    {
        if (rows == 1)
            step = minstep;

        CV_DbgAssert(step >= minstep);

        flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
    }
    dataend += step * (rows - 1) + minstep;
}

cv::gpu::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
    flags(Mat::MAGIC_VAL + (type_ & TYPE_MASK)), rows(size_.height), cols(size_.width),
    step(step_), data((uchar*)data_), refcount(0),
    datastart((uchar*)data_), dataend((uchar*)data_)
{
    size_t minstep = cols * elemSize();

    if (step == Mat::AUTO_STEP)
    {
        step = minstep;
        flags |= Mat::CONTINUOUS_FLAG;
    }
    else
    {
        if (rows == 1)
            step = minstep;

        CV_DbgAssert(step >= minstep);

        flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
    }
    dataend += step * (rows - 1) + minstep;
}

cv::gpu::GpuMat::GpuMat(const GpuMat& m, Range rowRange, Range colRange)
{
    flags = m.flags;
    step = m.step; refcount = m.refcount;
    data = m.data; datastart = m.datastart; dataend = m.dataend;

    if (rowRange == Range::all())
        rows = m.rows;
    else
    {
        CV_Assert(0 <= rowRange.start && rowRange.start <= rowRange.end && rowRange.end <= m.rows);

        rows = rowRange.size();
        data += step*rowRange.start;
    }

    if (colRange == Range::all())
        cols = m.cols;
    else
    {
        CV_Assert(0 <= colRange.start && colRange.start <= colRange.end && colRange.end <= m.cols);

        cols = colRange.size();
        data += colRange.start*elemSize();
        flags &= cols < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
    }

    if (rows == 1)
        flags |= Mat::CONTINUOUS_FLAG;

    if (refcount)
        CV_XADD(refcount, 1);

    if (rows <= 0 || cols <= 0)
        rows = cols = 0;
}

cv::gpu::GpuMat::GpuMat(const GpuMat& m, Rect roi) :
    flags(m.flags), rows(roi.height), cols(roi.width),
    step(m.step), data(m.data + roi.y*step), refcount(m.refcount),
    datastart(m.datastart), dataend(m.dataend)
{
    flags &= roi.width < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
    data += roi.x * elemSize();

    CV_Assert(0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows);

    if (refcount)
        CV_XADD(refcount, 1);

    if (rows <= 0 || cols <= 0)
        rows = cols = 0;
}

cv::gpu::GpuMat::GpuMat(const Mat& m) :
    flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
    upload(m);
}

GpuMat& cv::gpu::GpuMat::operator = (const GpuMat& m)
{
    if (this != &m)
    {
        GpuMat temp(m);
        swap(temp);
    }

    return *this;
}

void cv::gpu::GpuMat::swap(GpuMat& b)
{
    std::swap(flags, b.flags);
    std::swap(rows, b.rows);
    std::swap(cols, b.cols);
    std::swap(step, b.step);
    std::swap(data, b.data);
    std::swap(datastart, b.datastart);
    std::swap(dataend, b.dataend);
    std::swap(refcount, b.refcount);
}

void cv::gpu::GpuMat::locateROI(Size& wholeSize, Point& ofs) const
{
    size_t esz = elemSize();
    ptrdiff_t delta1 = data - datastart;
    ptrdiff_t delta2 = dataend - datastart;

    CV_DbgAssert(step > 0);

    if (delta1 == 0)
        ofs.x = ofs.y = 0;
    else
    {
        ofs.y = static_cast<int>(delta1 / step);
        ofs.x = static_cast<int>((delta1 - step * ofs.y) / esz);

        CV_DbgAssert(data == datastart + ofs.y * step + ofs.x * esz);
    }

    size_t minstep = (ofs.x + cols) * esz;

    wholeSize.height = std::max(static_cast<int>((delta2 - minstep) / step + 1), ofs.y + rows);
    wholeSize.width = std::max(static_cast<int>((delta2 - step * (wholeSize.height - 1)) / esz), ofs.x + cols);
}

GpuMat& cv::gpu::GpuMat::adjustROI(int dtop, int dbottom, int dleft, int dright)
{
    Size wholeSize;
    Point ofs;
    locateROI(wholeSize, ofs);

    size_t esz = elemSize();

    int row1 = std::max(ofs.y - dtop, 0);
    int row2 = std::min(ofs.y + rows + dbottom, wholeSize.height);

    int col1 = std::max(ofs.x - dleft, 0);
    int col2 = std::min(ofs.x + cols + dright, wholeSize.width);

    data += (row1 - ofs.y) * step + (col1 - ofs.x) * esz;
    rows = row2 - row1;
    cols = col2 - col1;

    if (esz * cols == step || rows == 1)
        flags |= Mat::CONTINUOUS_FLAG;
    else
        flags &= ~Mat::CONTINUOUS_FLAG;

    return *this;
}

GpuMat cv::gpu::GpuMat::reshape(int new_cn, int new_rows) const
{
    GpuMat hdr = *this;

    int cn = channels();
    if (new_cn == 0)
        new_cn = cn;

    int total_width = cols * cn;

    if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0)
        new_rows = rows * total_width / new_cn;

    if (new_rows != 0 && new_rows != rows)
    {
        int total_size = total_width * rows;

        if (!isContinuous())
            CV_Error(CV_BadStep, "The matrix is not continuous, thus its number of rows can not be changed");

        if ((unsigned)new_rows > (unsigned)total_size)
            CV_Error(CV_StsOutOfRange, "Bad new number of rows");

        total_width = total_size / new_rows;

        if (total_width * new_rows != total_size)
            CV_Error(CV_StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");

        hdr.rows = new_rows;
        hdr.step = total_width * elemSize1();
    }

    int new_width = total_width / new_cn;

    if (new_width * new_cn != total_width)
        CV_Error(CV_BadNumChannels, "The total width is not divisible by the new number of channels");

    hdr.cols = new_width;
    hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);

    return hdr;
}

cv::Mat::Mat(const GpuMat& m) : flags(0), dims(0), rows(0), cols(0), data(0), refcount(0), datastart(0), dataend(0), datalimit(0), allocator(0), size(&rows)
{
    m.download(*this);
}

namespace
{
    void throw_nogpu()
    {
        CV_Error(CV_GpuNotSupported, "The library is compiled without GPU support");
    }

    class EmptyFuncTable : public GpuFuncTable
    {
    public:
        void copy(const Mat&, GpuMat&) const { throw_nogpu(); }
        void copy(const GpuMat&, Mat&) const { throw_nogpu(); }
        void copy(const GpuMat&, GpuMat&) const { throw_nogpu(); }

        void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu(); }

        void convert(const GpuMat&, GpuMat&) const { throw_nogpu(); }
        void convert(const GpuMat&, GpuMat&, double, double) const { throw_nogpu(); }

        void setTo(GpuMat&, Scalar, const GpuMat&) const { throw_nogpu(); }

        void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu(); }
        void free(void*) const {}
    };

    const GpuFuncTable* g_funcTbl = 0;

    const GpuFuncTable* gpuFuncTable()
    {
        static EmptyFuncTable empty;
        return g_funcTbl ? g_funcTbl : &empty;
    }
}

void cv::gpu::setGpuFuncTable(const GpuFuncTable* funcTbl)
{
    g_funcTbl = funcTbl;
}

void cv::gpu::GpuMat::upload(const Mat& m)
{
    CV_DbgAssert(!m.empty());

    create(m.size(), m.type());

    gpuFuncTable()->copy(m, *this);
}

void cv::gpu::GpuMat::download(Mat& m) const
{
    CV_DbgAssert(!empty());

    m.create(size(), type());

    gpuFuncTable()->copy(*this, m);
}

void cv::gpu::GpuMat::copyTo(GpuMat& m) const
{
    CV_DbgAssert(!empty());

    m.create(size(), type());

    gpuFuncTable()->copy(*this, m);
}

void cv::gpu::GpuMat::copyTo(GpuMat& mat, const GpuMat& mask) const
{
    if (mask.empty())
        copyTo(mat);
    else
    {
        mat.create(size(), type());

        gpuFuncTable()->copyWithMask(*this, mat, mask);
    }
}

void cv::gpu::GpuMat::convertTo(GpuMat& dst, int rtype, double alpha, double beta) const
{
    bool noScale = fabs(alpha - 1) < numeric_limits<double>::epsilon() && fabs(beta) < numeric_limits<double>::epsilon();

    if (rtype < 0)
        rtype = type();
    else
        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());

    int sdepth = depth();
    int ddepth = CV_MAT_DEPTH(rtype);
    if (sdepth == ddepth && noScale)
    {
        copyTo(dst);
        return;
    }

    GpuMat temp;
    const GpuMat* psrc = this;
    if (sdepth != ddepth && psrc == &dst)
    {
        temp = *this;
        psrc = &temp;
    }

    dst.create(size(), rtype);

    if (noScale)
        gpuFuncTable()->convert(*psrc, dst);
    else
        gpuFuncTable()->convert(*psrc, dst, alpha, beta);
}

GpuMat& cv::gpu::GpuMat::setTo(Scalar s, const GpuMat& mask)
{
    CV_Assert(mask.empty() || mask.type() == CV_8UC1);
    CV_DbgAssert(!empty());

    gpuFuncTable()->setTo(*this, s, mask);

    return *this;
}

void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
{
    _type &= TYPE_MASK;

    if (rows == _rows && cols == _cols && type() == _type && data)
        return;

    if (data)
        release();

    CV_DbgAssert(_rows >= 0 && _cols >= 0);

    if (_rows > 0 && _cols > 0)
    {
        flags = Mat::MAGIC_VAL + _type;
        rows = _rows;
        cols = _cols;

        size_t esz = elemSize();

        void* devPtr;
        gpuFuncTable()->mallocPitch(&devPtr, &step, esz * cols, rows);

        // Single row must be continuous
        if (rows == 1)
            step = esz * cols;

        if (esz * cols == step)
            flags |= Mat::CONTINUOUS_FLAG;

        int64 _nettosize = static_cast<int64>(step) * rows;
        size_t nettosize = static_cast<size_t>(_nettosize);

        datastart = data = static_cast<uchar*>(devPtr);
        dataend = data + nettosize;

        refcount = static_cast<int*>(fastMalloc(sizeof(*refcount)));
        *refcount = 1;
    }
}

void cv::gpu::GpuMat::release()
{
    if (refcount && CV_XADD(refcount, -1) == 1)
    {
        fastFree(refcount);

        gpuFuncTable()->free(datastart);
    }

    data = datastart = dataend = 0;
    step = rows = cols = 0;
    refcount = 0;
}
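The function table above is the seam that lets core own GpuMat without linking against CUDA: the gpu module (or any other provider) implements GpuFuncTable and registers it once via setGpuFuncTable(). Below is a hedged sketch of such a provider; CudaFuncTable is hypothetical, not the gpu module's real implementation, and the masked/converting operations, which need real device kernels, are left as stubs. Error checking of the cuda* return codes is also omitted.

    #include <cuda_runtime.h>
    #include "opencv2/core/gpumat.hpp"

    using cv::Mat; using cv::Scalar; using cv::gpu::GpuMat;

    class CudaFuncTable : public cv::gpu::GpuFuncTable
    {
    public:
        void copy(const Mat& src, GpuMat& dst) const
        {
            cudaMemcpy2D(dst.data, dst.step, src.data, src.step,
                         src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice);
        }
        void copy(const GpuMat& src, Mat& dst) const
        {
            cudaMemcpy2D(dst.data, dst.step, src.data, src.step,
                         src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost);
        }
        void copy(const GpuMat& src, GpuMat& dst) const
        {
            cudaMemcpy2D(dst.data, dst.step, src.data, src.step,
                         src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice);
        }

        // These operations need device kernels; stubbed in this sketch.
        void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { CV_Error(CV_StsNotImplemented, "sketch stub"); }
        void convert(const GpuMat&, GpuMat&) const { CV_Error(CV_StsNotImplemented, "sketch stub"); }
        void convert(const GpuMat&, GpuMat&, double, double) const { CV_Error(CV_StsNotImplemented, "sketch stub"); }
        void setTo(GpuMat&, Scalar, const GpuMat&) const { CV_Error(CV_StsNotImplemented, "sketch stub"); }

        void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
        {
            cudaMallocPitch(devPtr, step, width, height);
        }
        void free(void* devPtr) const { cudaFree(devPtr); }
    };

    // Registered once, e.g. during the providing module's initialization:
    static const CudaFuncTable g_cudaTable;
    // cv::gpu::setGpuFuncTable(&g_cudaTable);

Because gpuFuncTable() falls back to EmptyFuncTable when nothing is registered, core builds and runs without any CUDA toolkit present.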
@@ -3,7 +3,8 @@ set(name "gpu")
set(the_target "opencv_${name}")
project(${the_target})

set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann" "opencv_calib3d") #"opencv_features2d" "opencv_flann" "opencv_objdetect" - only headers needed
set(DEPS "opencv_core" "opencv_imgproc" "opencv_calib3d" "opencv_objdetect")
set(DEPS_HEADER ${DEPS} "opencv_features2d" "opencv_flann")
set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu)

include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
@@ -27,6 +28,13 @@ file(GLOB lib_device_hdrs_detail "src/opencv2/gpu/device/detail/*.h*")
source_group("Device" FILES ${lib_device_hdrs})
source_group("Device\\Detail" FILES ${lib_device_hdrs_detail})

foreach(d ${DEPS_HEADER})
    if(${d} MATCHES "opencv_")
        string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
        include_directories("${d_dir}/include")
    endif()
endforeach()

if (HAVE_CUDA)
    file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp")
    file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu")
@@ -51,7 +59,6 @@ if (HAVE_CUDA)
    set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fno-finite-math-only;")
endif()

string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
@@ -60,7 +67,7 @@ if (HAVE_CUDA)
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}")
#string(REPLACE "/W4" "/W3" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4211 /wd4201 /wd4100 /wd4505 /wd4408 /wd4251")

string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
string(REPLACE "/EHsc-" "/EHs" CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE}")
@@ -71,20 +78,17 @@ if (HAVE_CUDA)
    endif()

    if (BUILD_SHARED_LIBS)
        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-DCVAPI_EXPORTS")
    endif()
    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-DCVAPI_EXPORTS")
endif()

if(MSVC)
    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/wd4251")
endif()

CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda})
#CUDA_BUILD_CLEAN_TARGET()
endif()

foreach(d ${DEPS})
    if(${d} MATCHES "opencv_")
        string(REPLACE "opencv_" "${CMAKE_CURRENT_SOURCE_DIR}/../" d_dir ${d})
        include_directories("${d_dir}/include")
    endif()
endforeach()

add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${lib_device_hdrs_detail} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs})

# For dynamic link numbering conventions
@@ -40,122 +40,4 @@
//
//M*/

#ifndef __OPENCV_GPU_DevMem2D_HPP__
#define __OPENCV_GPU_DevMem2D_HPP__

namespace cv
{
    namespace gpu
    {
        // Simple lightweight structures that encapsulate information about an image on the device.
        // They are intended to be passed to nvcc-compiled code, since GpuMat itself depends on headers that nvcc can't compile.

#if defined(__CUDACC__)
    #define __CV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
#else
    #define __CV_GPU_HOST_DEVICE__
#endif

        template <bool expr> struct StaticAssert;
        template <> struct StaticAssert<true> { static __CV_GPU_HOST_DEVICE__ void check() {} };

        template<typename T> struct DevPtr
        {
            typedef T elem_type;
            typedef int index_type;

            enum { elem_size = sizeof(elem_type) };

            T* data;

            __CV_GPU_HOST_DEVICE__ DevPtr() : data(0) {}
            __CV_GPU_HOST_DEVICE__ DevPtr(T* data_) : data(data_) {}

            __CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
            __CV_GPU_HOST_DEVICE__ operator T*() { return data; }
            __CV_GPU_HOST_DEVICE__ operator const T*() const { return data; }
        };

        template<typename T> struct PtrSz : public DevPtr<T>
        {
            __CV_GPU_HOST_DEVICE__ PtrSz() : size(0) {}
            __CV_GPU_HOST_DEVICE__ PtrSz(T* data_, size_t size_) : DevPtr<T>(data_), size(size_) {}

            size_t size;
        };

        template<typename T> struct PtrStep : public DevPtr<T>
        {
            __CV_GPU_HOST_DEVICE__ PtrStep() : step(0) {}
            __CV_GPU_HOST_DEVICE__ PtrStep(T* data_, size_t step_) : DevPtr<T>(data_), step(step_) {}

            /** \brief stride between two consecutive rows. The step is always stored in bytes! */
            size_t step;

            __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return (T*)((char*)DevPtr<T>::data + y * step); }
            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)((const char*)DevPtr<T>::data + y * step); }

            __CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
            __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
        };

        template <typename T> struct PtrStepSz : public PtrStep<T>
        {
            __CV_GPU_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {}
            __CV_GPU_HOST_DEVICE__ PtrStepSz(int rows_, int cols_, T* data_, size_t step_)
                : PtrStep<T>(data_, step_), cols(cols_), rows(rows_) {}

            int cols;
            int rows;
        };

        template <typename T> struct DevMem2D_ : public PtrStepSz<T>
        {
            DevMem2D_() {}
            DevMem2D_(int rows_, int cols_, T *data_, size_t step_) : PtrStepSz<T>(rows_, cols_, data_, step_) {}

            template <typename U>
            explicit DevMem2D_(const DevMem2D_<U>& d) : PtrStepSz<T>(d.rows, d.cols, (T*)d.data, d.step) {}
        };

        template<typename T> struct PtrElemStep_ : public PtrStep<T>
        {
            PtrElemStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step)
            {
                StaticAssert<256 % sizeof(T) == 0>::check();

                PtrStep<T>::step /= PtrStep<T>::elem_size;
            }

            __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return PtrStep<T>::data + y * PtrStep<T>::step; }
            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep<T>::data + y * PtrStep<T>::step; }

            __CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
            __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
        };

        template<typename T> struct PtrStep_ : public PtrStep<T>
        {
            PtrStep_() {}
            PtrStep_(const DevMem2D_<T>& mem) : PtrStep<T>(mem.data, mem.step) {}
        };

#undef __CV_GPU_HOST_DEVICE__

        typedef DevMem2D_<unsigned char> DevMem2Db;
        typedef DevMem2Db DevMem2D;
        typedef DevMem2D_<float> DevMem2Df;
        typedef DevMem2D_<int> DevMem2Di;

        typedef PtrStep<unsigned char> PtrStepb;
        typedef PtrStep<float> PtrStepf;
        typedef PtrStep<int> PtrStepi;

        typedef PtrElemStep_<unsigned char> PtrElemStep;
        typedef PtrElemStep_<float> PtrElemStepf;
        typedef PtrElemStep_<int> PtrElemStepi;
    }
}

#endif /* __OPENCV_GPU_DevMem2D_HPP__ */
#include "opencv2/core/devmem2d.hpp"
(File diff suppressed because it is too large.)
@ -40,427 +40,4 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPUMAT_HPP__
|
||||
#define __OPENCV_GPUMAT_HPP__
|
||||
|
||||
#include "opencv2/core/core.hpp"
|
||||
#include "opencv2/gpu/devmem2d.hpp"
|
||||
|
||||
namespace cv { namespace gpu
|
||||
{
|
||||
//! Smart pointer for GPU memory with reference counting. Its interface is mostly similar with cv::Mat.
|
||||
class CV_EXPORTS GpuMat
|
||||
{
|
||||
public:
|
||||
//! returns lightweight DevMem2D_ structure for passing to nvcc-compiled code.
|
||||
// Contains just image size, data ptr and step.
|
||||
template <class T> operator DevMem2D_<T>() const;
|
||||
template <class T> operator PtrStep_<T>() const;
|
||||
template <class T> operator PtrStep<T>() const;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//! builds GpuMat from Mat. Perfom blocking upload to device.
|
||||
explicit GpuMat(const Mat& m);
|
||||
|
||||
//! pefroms blocking upload data to GpuMat.
|
||||
void upload(const Mat& m);
|
||||
|
||||
//! downloads data from device to host memory. Blocking calls.
|
||||
void download(Mat& m) const;
|
||||
operator Mat() const
|
||||
{
|
||||
Mat m;
|
||||
download(m);
|
||||
return m;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//! default constructor
|
||||
GpuMat();
|
||||
|
||||
//! constructs GpuMatrix of the specified size and type (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
|
||||
GpuMat(int rows, int cols, int type);
|
||||
GpuMat(Size size, int type);
|
||||
|
||||
//! constucts GpuMatrix and fills it with the specified value _s.
|
||||
GpuMat(int rows, int cols, int type, const Scalar& s);
|
||||
GpuMat(Size size, int type, const Scalar& s);
|
||||
|
||||
//! copy constructor
|
||||
GpuMat(const GpuMat& m);
|
||||
|
||||
//! constructor for GpuMatrix headers pointing to user-allocated data
|
||||
GpuMat(int rows, int cols, int type, void* data, size_t step = Mat::AUTO_STEP);
|
||||
GpuMat(Size size, int type, void* data, size_t step = Mat::AUTO_STEP);
|
||||
|
||||
//! creates a matrix header for a part of the bigger matrix
|
||||
GpuMat(const GpuMat& m, const Range& rowRange, const Range& colRange);
|
||||
GpuMat(const GpuMat& m, const Rect& roi);
|
||||
|
||||
//! destructor - calls release()
|
||||
~GpuMat();
|
||||
|
||||
//! assignment operators
|
||||
GpuMat& operator = (const GpuMat& m);
|
||||
|
||||
//! returns a new GpuMatrix header for the specified row
|
||||
GpuMat row(int y) const;
|
||||
//! returns a new GpuMatrix header for the specified column
|
||||
GpuMat col(int x) const;
|
||||
//! ... for the specified row span
|
||||
GpuMat rowRange(int startrow, int endrow) const;
|
||||
GpuMat rowRange(const Range& r) const;
|
||||
//! ... for the specified column span
|
||||
GpuMat colRange(int startcol, int endcol) const;
|
||||
GpuMat colRange(const Range& r) const;
|
||||
|
||||
//! returns deep copy of the GpuMatrix, i.e. the data is copied
|
||||
GpuMat clone() const;
|
||||
//! copies the GpuMatrix content to "m".
|
||||
// It calls m.create(this->size(), this->type()).
|
||||
void copyTo(GpuMat& m) const;
|
||||
//! copies those GpuMatrix elements to "m" that are marked with non-zero mask elements.
|
||||
void copyTo(GpuMat& m, const GpuMat& mask) const;
|
||||
//! converts GpuMatrix to another datatype with optional scalng. See cvConvertScale.
|
||||
void convertTo(GpuMat& m, int rtype, double alpha = 1, double beta = 0) const;
|
||||
|
||||
void assignTo(GpuMat& m, int type=-1) const;
|
||||
|
||||
//! sets every GpuMatrix element to s
|
||||
GpuMat& operator = (const Scalar& s);
|
||||
//! sets some of the GpuMatrix elements to s, according to the mask
|
||||
GpuMat& setTo(const Scalar& s, const GpuMat& mask = GpuMat());
|
||||
//! creates alternative GpuMatrix header for the same data, with different
|
||||
// number of channels and/or different number of rows. see cvReshape.
|
||||
GpuMat reshape(int cn, int rows = 0) const;
|
||||
|
||||
//! allocates new GpuMatrix data unless the GpuMatrix already has specified size and type.
|
||||
// previous data is unreferenced if needed.
|
||||
void create(int rows, int cols, int type);
|
||||
void create(Size size, int type);
|
||||
//! decreases reference counter;
|
||||
// deallocate the data when reference counter reaches 0.
|
||||
void release();
|
||||
|
||||
//! swaps with other smart pointer
|
||||
void swap(GpuMat& mat);
|
||||
|
||||
//! locates GpuMatrix header within a parent GpuMatrix. See below
|
||||
void locateROI(Size& wholeSize, Point& ofs) const;
|
||||
//! moves/resizes the current GpuMatrix ROI inside the parent GpuMatrix.
|
||||
GpuMat& adjustROI(int dtop, int dbottom, int dleft, int dright);
|
||||
//! extracts a rectangular sub-GpuMatrix
|
||||
// (this is a generalized form of row, rowRange etc.)
|
||||
GpuMat operator()(Range rowRange, Range colRange) const;
|
||||
GpuMat operator()(const Rect& roi) const;
|
||||
|
||||
//! returns true iff the GpuMatrix data is continuous
|
||||
// (i.e. when there are no gaps between successive rows).
|
||||
// similar to CV_IS_GpuMat_CONT(cvGpuMat->type)
|
||||
bool isContinuous() const;
|
||||
//! returns element size in bytes,
|
||||
// similar to CV_ELEM_SIZE(cvMat->type)
|
||||
size_t elemSize() const;
|
||||
//! returns the size of element channel in bytes.
|
||||
size_t elemSize1() const;
|
||||
//! returns element type, similar to CV_MAT_TYPE(cvMat->type)
|
||||
int type() const;
|
||||
//! returns element type, similar to CV_MAT_DEPTH(cvMat->type)
|
||||
int depth() const;
|
||||
//! returns element type, similar to CV_MAT_CN(cvMat->type)
|
||||
int channels() const;
|
||||
//! returns step/elemSize1()
|
||||
size_t step1() const;
|
||||
//! returns GpuMatrix size:
|
||||
// width == number of columns, height == number of rows
|
||||
Size size() const;
|
||||
//! returns true if GpuMatrix data is NULL
|
||||
bool empty() const;
|
||||
|
||||
//! returns pointer to y-th row
|
||||
uchar* ptr(int y = 0);
|
||||
const uchar* ptr(int y = 0) const;
|
||||
|
||||
//! template version of the above method
|
||||
template<typename _Tp> _Tp* ptr(int y = 0);
|
||||
template<typename _Tp> const _Tp* ptr(int y = 0) const;
|
||||
|
||||
/*! includes several bit-fields:
|
||||
- the magic signature
|
||||
- continuity flag
|
||||
- depth
|
||||
- number of channels
|
||||
*/
|
||||
int flags;
|
||||
|
||||
//! the number of rows and columns
|
||||
int rows, cols;
|
||||
|
||||
//! a distance between successive rows in bytes; includes the gap if any
|
||||
size_t step;
|
||||
|
||||
//! pointer to the data
|
||||
uchar* data;
|
||||
|
||||
//! pointer to the reference counter;
|
||||
// when GpuMatrix points to user-allocated data, the pointer is NULL
|
||||
int* refcount;
|
||||
|
||||
//! helper fields used in locateROI and adjustROI
|
||||
uchar* datastart;
|
||||
uchar* dataend;
|
||||
};
|
||||
|
||||
//! Creates continuous GPU matrix
|
||||
CV_EXPORTS void createContinuous(int rows, int cols, int type, GpuMat& m);
|
||||
CV_EXPORTS GpuMat createContinuous(int rows, int cols, int type);
|
||||
CV_EXPORTS void createContinuous(Size size, int type, GpuMat& m);
|
||||
CV_EXPORTS GpuMat createContinuous(Size size, int type);
|
||||
|
||||
//! Ensures that size of the given matrix is not less than (rows, cols) size
|
||||
//! and matrix type is match specified one too
|
||||
CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m);
|
||||
CV_EXPORTS void ensureSizeIsEnough(Size size, int type, GpuMat& m);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <class T> inline GpuMat::operator DevMem2D_<T>() const { return DevMem2D_<T>(rows, cols, (T*)data, step); }
|
||||
template <class T> inline GpuMat::operator PtrStep_<T>() const { return PtrStep_<T>(static_cast< DevMem2D_<T> >(*this)); }
|
||||
template <class T> inline GpuMat::operator PtrStep<T>() const { return PtrStep<T>((T*)data, step); }
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
inline GpuMat::GpuMat()
|
||||
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
|
||||
{
|
||||
}
|
||||
|
||||
inline GpuMat::GpuMat(int rows_, int cols_, int type_)
|
||||
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
|
||||
{
|
||||
if (rows_ > 0 && cols_ > 0)
|
||||
create(rows_, cols_, type_);
|
||||
}
|
||||
|
||||
inline GpuMat::GpuMat(Size size_, int type_)
|
||||
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
|
||||
{
|
||||
if (size_.height > 0 && size_.width > 0)
|
||||
create(size_.height, size_.width, type_);
|
||||
}
|
||||
|
||||
inline GpuMat::GpuMat(int rows_, int cols_, int type_, const Scalar& s_)
|
||||
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
|
||||
{
|
||||
if (rows_ > 0 && cols_ > 0)
|
||||
{
|
||||
create(rows_, cols_, type_);
|
||||
setTo(s_);
|
||||
}
|
||||
}
|
||||
|
||||
inline GpuMat::GpuMat(Size size_, int type_, const Scalar& s_)
|
||||
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
|
||||
{
|
||||
if (size_.height > 0 && size_.width > 0)
|
||||
{
|
||||
create(size_.height, size_.width, type_);
|
||||
setTo(s_);
|
||||
}
|
||||
}

inline GpuMat::~GpuMat()
{
    release();
}

inline GpuMat GpuMat::clone() const
{
    GpuMat m;
    copyTo(m);
    return m;
}

inline void GpuMat::assignTo(GpuMat& m, int type) const
{
    if (type < 0)
        m = *this;
    else
        convertTo(m, type);
}

inline size_t GpuMat::step1() const
{
    return step / elemSize1();
}

inline bool GpuMat::empty() const
{
    return data == 0;
}

template<typename _Tp> inline _Tp* GpuMat::ptr(int y)
{
    return (_Tp*)ptr(y);
}

template<typename _Tp> inline const _Tp* GpuMat::ptr(int y) const
{
    return (const _Tp*)ptr(y);
}

inline void swap(GpuMat& a, GpuMat& b)
{
    a.swap(b);
}

inline GpuMat GpuMat::row(int y) const
{
    return GpuMat(*this, Range(y, y+1), Range::all());
}

inline GpuMat GpuMat::col(int x) const
{
    return GpuMat(*this, Range::all(), Range(x, x+1));
}

inline GpuMat GpuMat::rowRange(int startrow, int endrow) const
{
    return GpuMat(*this, Range(startrow, endrow), Range::all());
}

inline GpuMat GpuMat::rowRange(const Range& r) const
{
    return GpuMat(*this, r, Range::all());
}

inline GpuMat GpuMat::colRange(int startcol, int endcol) const
{
    return GpuMat(*this, Range::all(), Range(startcol, endcol));
}

inline GpuMat GpuMat::colRange(const Range& r) const
{
    return GpuMat(*this, Range::all(), r);
}

inline void GpuMat::create(Size size_, int type_)
{
    create(size_.height, size_.width, type_);
}

inline GpuMat GpuMat::operator()(Range rowRange, Range colRange) const
{
    return GpuMat(*this, rowRange, colRange);
}

inline GpuMat GpuMat::operator()(const Rect& roi) const
{
    return GpuMat(*this, roi);
}

inline bool GpuMat::isContinuous() const
{
    return (flags & Mat::CONTINUOUS_FLAG) != 0;
}

inline size_t GpuMat::elemSize() const
{
    return CV_ELEM_SIZE(flags);
}

inline size_t GpuMat::elemSize1() const
{
    return CV_ELEM_SIZE1(flags);
}

inline int GpuMat::type() const
{
    return CV_MAT_TYPE(flags);
}

inline int GpuMat::depth() const
{
    return CV_MAT_DEPTH(flags);
}

inline int GpuMat::channels() const
{
    return CV_MAT_CN(flags);
}

inline Size GpuMat::size() const
{
    return Size(cols, rows);
}

inline unsigned char* GpuMat::ptr(int y)
{
    CV_DbgAssert((unsigned)y < (unsigned)rows);
    return data + step * y;
}

inline const unsigned char* GpuMat::ptr(int y) const
{
    CV_DbgAssert((unsigned)y < (unsigned)rows);
    return data + step * y;
}

inline GpuMat& GpuMat::operator = (const Scalar& s)
{
    setTo(s);
    return *this;
}

inline GpuMat createContinuous(int rows, int cols, int type)
{
    GpuMat m;
    createContinuous(rows, cols, type, m);
    return m;
}

inline void createContinuous(Size size, int type, GpuMat& m)
{
    createContinuous(size.height, size.width, type, m);
}

inline GpuMat createContinuous(Size size, int type)
{
    GpuMat m;
    createContinuous(size, type, m);
    return m;
}

inline void ensureSizeIsEnough(Size size, int type, GpuMat& m)
{
    ensureSizeIsEnough(size.height, size.width, type, m);
}

inline void createContinuous(int rows, int cols, int type, GpuMat& m)
{
    int area = rows * cols;
    if (!m.isContinuous() || m.type() != type || m.size().area() != area)
        m.create(1, area, type);
    m = m.reshape(0, rows);
}

inline void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m)
{
    if (m.type() == type && m.rows >= rows && m.cols >= cols)
        m = m(Rect(0, 0, cols, rows));
    else
        m.create(rows, cols, type);
}
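
// The reuse semantics of the two inline helpers above, made concrete
// (a sketch under the assumption that a CUDA device is present):
void reuseSemanticsDemo()
{
    using namespace cv::gpu;

    GpuMat m;
    ensureSizeIsEnough(100, 100, CV_8UC1, m);  // allocates 100x100
    uchar* p = m.data;

    ensureSizeIsEnough(50, 60, CV_8UC1, m);    // same allocation, 50x60 ROI of it
    CV_Assert(m.data == p && m.rows == 50 && m.cols == 60);

    // createContinuous packs rows even when the requested size shrinks:
    GpuMat c = createContinuous(50, 60, CV_8UC1);
    CV_Assert(c.isContinuous());
}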

}}

#endif // __OPENCV_GPUMAT_HPP__

#include "opencv2/core/gpumat.hpp"

@ -1,142 +0,0 @@
#ifndef __OPENCV_GPU_MATRIX_OPERATIONS_HPP__
#define __OPENCV_GPU_MATRIX_OPERATIONS_HPP__

namespace cv
{

namespace gpu
{
///////////////////////////////////////////////////////////////////////
//////////////////////////////// CudaMem ////////////////////////////////
///////////////////////////////////////////////////////////////////////

inline CudaMem::CudaMem() : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0) {}

inline CudaMem::CudaMem(int _rows, int _cols, int _type, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
    if( _rows > 0 && _cols > 0 )
        create( _rows, _cols, _type, _alloc_type);
}

inline CudaMem::CudaMem(Size _size, int _type, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
    if( _size.height > 0 && _size.width > 0 )
        create( _size.height, _size.width, _type, _alloc_type);
}

inline CudaMem::CudaMem(const CudaMem& m) : flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)
{
    if( refcount )
        CV_XADD(refcount, 1);
}

inline CudaMem::CudaMem(const Mat& m, int _alloc_type) : flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
    if( m.rows > 0 && m.cols > 0 )
        create( m.size(), m.type(), _alloc_type);

    Mat tmp = createMatHeader();
    m.copyTo(tmp);
}

inline CudaMem::~CudaMem()
{
    release();
}

inline CudaMem& CudaMem::operator = (const CudaMem& m)
{
    if( this != &m )
    {
        if( m.refcount )
            CV_XADD(m.refcount, 1);
        release();
        flags = m.flags;
        rows = m.rows; cols = m.cols;
        step = m.step; data = m.data;
        datastart = m.datastart;
        dataend = m.dataend;
        refcount = m.refcount;
        alloc_type = m.alloc_type;
    }
    return *this;
}

inline CudaMem CudaMem::clone() const
{
    CudaMem m(size(), type(), alloc_type);
    Mat to = m;
    Mat from = *this;
    from.copyTo(to);
    return m;
}

inline void CudaMem::create(Size _size, int _type, int _alloc_type) { create(_size.height, _size.width, _type, _alloc_type); }

//CPP void CudaMem::create(int _rows, int _cols, int _type, int _alloc_type);
//CPP void CudaMem::release();

inline Mat CudaMem::createMatHeader() const { return Mat(size(), type(), data, step); }
inline CudaMem::operator Mat() const { return createMatHeader(); }

inline CudaMem::operator GpuMat() const { return createGpuMatHeader(); }
//CPP GpuMat CudaMem::createGpuMatHeader() const;

inline bool CudaMem::isContinuous() const { return (flags & Mat::CONTINUOUS_FLAG) != 0; }
inline size_t CudaMem::elemSize() const { return CV_ELEM_SIZE(flags); }
inline size_t CudaMem::elemSize1() const { return CV_ELEM_SIZE1(flags); }
inline int CudaMem::type() const { return CV_MAT_TYPE(flags); }
inline int CudaMem::depth() const { return CV_MAT_DEPTH(flags); }
inline int CudaMem::channels() const { return CV_MAT_CN(flags); }
inline size_t CudaMem::step1() const { return step/elemSize1(); }
inline Size CudaMem::size() const { return Size(cols, rows); }
inline bool CudaMem::empty() const { return data == 0; }

} /* end of namespace gpu */

} /* end of namespace cv */

#endif /* __OPENCV_GPU_MATRIX_OPERATIONS_HPP__ */

@ -24,7 +24,7 @@ PERF_TEST_P(DevInfo_Size_MatType, transpose, testing::Combine(testing::ValuesIn(
transpose(src, dst);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}
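
// The recurring change in these tests replaces the implicit assignment
// "Mat dst_host = dst;" with the explicit Mat(const gpu::GpuMat&) download
// constructor added in the core module. A sketch of the equivalent forms
// (both are device-to-host copies):
void downloadResult(const cv::gpu::GpuMat& dst)
{
    cv::Mat dst_host(dst);       // explicit constructor, downloads device data

    cv::Mat dst_host2;
    dst.download(dst_host2);     // same copy, spelled as an explicit call

    CV_Assert(dst_host.size() == dst_host2.size());
}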

@ -55,7 +55,7 @@ PERF_TEST_P(DevInfo_Size_MatType_FlipCode, flip, testing::Combine(testing::Value
flip(src, dst, flipCode);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -85,7 +85,7 @@ PERF_TEST_P(DevInfo_Size_MatType, LUT, testing::Combine(testing::ValuesIn(device
LUT(src, lut, dst);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -115,8 +115,8 @@ PERF_TEST_P(DevInfo_Size, cartToPolar, testing::Combine(testing::ValuesIn(device
cartToPolar(x, y, magnitude, angle);
}

Mat magnitude_host = magnitude;
Mat angle_host = angle;
Mat magnitude_host(magnitude);
Mat angle_host(angle);

SANITY_CHECK(magnitude_host);
SANITY_CHECK(angle_host);

@ -147,8 +147,8 @@ PERF_TEST_P(DevInfo_Size, polarToCart, testing::Combine(testing::ValuesIn(device
polarToCart(magnitude, angle, x, y);
}

Mat x_host = x;
Mat y_host = angle;
Mat x_host(x);
Mat y_host(y);

SANITY_CHECK(x_host);
SANITY_CHECK(y_host);

@ -180,7 +180,7 @@ PERF_TEST_P(DevInfo_Size_MatType, addMat, testing::Combine(testing::ValuesIn(dev
add(a, b, c);
}

Mat c_host = c;
Mat c_host(c);

SANITY_CHECK(c_host);
}

@ -210,7 +210,7 @@ PERF_TEST_P(DevInfo_Size_MatType, addScalar, testing::Combine(testing::ValuesIn(
add(a, b, c);
}

Mat c_host = c;
Mat c_host(c);

SANITY_CHECK(c_host);
}

@ -241,7 +241,7 @@ PERF_TEST_P(DevInfo_Size_MatType, subtractMat, testing::Combine(testing::ValuesI
subtract(a, b, c);
}

Mat c_host = c;
Mat c_host(c);

SANITY_CHECK(c_host);
}

@ -270,7 +270,7 @@ PERF_TEST_P(DevInfo_Size, multiplyMat, testing::Combine(testing::ValuesIn(device
multiply(a, b, c);
}

Mat c_host = c;
Mat c_host(c);

SANITY_CHECK(c_host);
}

@ -300,7 +300,7 @@ PERF_TEST_P(DevInfo_Size_MatType, multiplyScalar, testing::Combine(testing::Valu
multiply(a, b, c);
}

Mat c_host = c;
Mat c_host(c);

SANITY_CHECK(c_host);
}

@ -327,7 +327,7 @@ PERF_TEST_P(DevInfo_Size, exp, testing::Combine(testing::ValuesIn(devices()),
exp(a, b);
}

Mat b_host = b;
Mat b_host(b);

SANITY_CHECK(b_host);
}

@ -356,7 +356,7 @@ PERF_TEST_P(DevInfo_Size_MatType, pow, testing::Combine(testing::ValuesIn(device
pow(src, 2.0, dst);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -389,7 +389,7 @@ PERF_TEST_P(DevInfo_Size_MatType_CmpOp, compare, testing::Combine(testing::Value
compare(src1, src2, dst, cmpop);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -418,7 +418,7 @@ PERF_TEST_P(DevInfo_Size_MatType, bitwise_not, testing::Combine(testing::ValuesI
bitwise_not(src, dst);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -449,7 +449,7 @@ PERF_TEST_P(DevInfo_Size_MatType, bitwise_and, testing::Combine(testing::ValuesI
bitwise_and(src1, src2, dst);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -480,7 +480,7 @@ PERF_TEST_P(DevInfo_Size_MatType, min, testing::Combine(testing::ValuesIn(device
min(src1, src2, dst);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -712,7 +712,7 @@ PERF_TEST_P(DevInfo_Size_MatType, addWeighted, testing::Combine(testing::ValuesI
addWeighted(src1, 0.5, src2, 0.5, 0.0, dst);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -743,7 +743,7 @@ PERF_TEST_P(DevInfo_Size_MatType_FlipCode, reduce, testing::Combine(testing::Val
reduce(src, dst, dim, CV_REDUCE_MIN);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -774,7 +774,7 @@ PERF_TEST_P(DevInfo_Size, gemm, testing::Combine(testing::ValuesIn(devices()),
gemm(src1, src2, 1.0, src3, 1.0, dst);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -20,7 +20,7 @@ PERF_TEST_P(DevInfo, transformPoints, testing::ValuesIn(devices()))
transformPoints(src, Mat::ones(1, 3, CV_32FC1), Mat::ones(1, 3, CV_32FC1), dst);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -45,7 +45,7 @@ PERF_TEST_P(DevInfo, projectPoints, testing::ValuesIn(devices()))
projectPoints(src, Mat::ones(1, 3, CV_32FC1), Mat::ones(1, 3, CV_32FC1), Mat::ones(3, 3, CV_32FC1), Mat(), dst);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -28,7 +28,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, boxFilter, testing::Combine(testing
filter->apply(src, dst);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -63,7 +63,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MorphOp_KernelSize, morphologyFilter, testing::
filter->apply(src, dst);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -96,7 +96,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, linearFilter, testing::Combine(test
filter->apply(src, dst);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -130,7 +130,7 @@ PERF_TEST_P(DevInfo_Size_MatType_KernelSize, separableLinearFilter, testing::Com
filter->apply(src, dst, Rect(0, 0, src.cols, src.rows));
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -36,7 +36,7 @@ PERF_TEST_P(DevInfo_Size_MatType_Interpolation_BorderMode, remap, testing::Combi
remap(src, dst, xmap, ymap, interpolation, borderMode);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -63,7 +63,7 @@ PERF_TEST_P(DevInfo, meanShiftFiltering, testing::ValuesIn(devices()))
meanShiftFiltering(src, dst, 50, 50);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -91,8 +91,8 @@ PERF_TEST_P(DevInfo, meanShiftProc, testing::ValuesIn(devices()))
meanShiftProc(src, dstr, dstsp, 50, 50);
}

Mat dstr_host = dstr;
Mat dstsp_host = dstsp;
Mat dstr_host(dstr);
Mat dstsp_host(dstsp);

SANITY_CHECK(dstr_host);
SANITY_CHECK(dstsp_host);

@ -25,7 +25,7 @@ PERF_TEST_P(DevInfo_Size_MatType, merge, testing::Combine(testing::ValuesIn(devi
merge(src, dst);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -82,7 +82,7 @@ PERF_TEST_P(DevInfo_Size_MatType, setTo, testing::Combine(testing::ValuesIn(devi
src.setTo(val);
}

Mat src_host = src;
Mat src_host(src);

SANITY_CHECK(src_host);
}

@ -115,7 +115,7 @@ PERF_TEST_P(DevInfo_Size_MatType, setToMasked, testing::Combine(testing::ValuesI
src.setTo(val, mask);
}

src_host = src;
src.download(src_host);

SANITY_CHECK(src_host);
}

@ -148,7 +148,7 @@ PERF_TEST_P(DevInfo_Size_MatType, copyToMasked, testing::Combine(testing::Values
src.copyTo(dst, mask);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -182,7 +182,7 @@ PERF_TEST_P(DevInfo_Size_MatType_MatType, convertTo, testing::Combine(testing::V
src.convertTo(dst, type2, a, b);
}

Mat dst_host = dst;
Mat dst_host(dst);

SANITY_CHECK(dst_host);
}

@ -425,16 +425,22 @@ void cv::gpu::magnitudeSqr(const GpuMat& src, GpuMat& dst, Stream& stream)
////////////////////////////////////////////////////////////////////////
// Polar <-> Cart

namespace cv { namespace gpu { namespace mathfunc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace mathfunc
{
void cartToPolar_gpu(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, bool magSqr, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream);
void polarToCart_gpu(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream);
}}}
void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
}

END_OPENCV_DEVICE_NAMESPACE

namespace
{
inline void cartToPolar_caller(const GpuMat& x, const GpuMat& y, GpuMat* mag, bool magSqr, GpuMat* angle, bool angleInDegrees, cudaStream_t stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;

CV_DbgAssert(x.size() == y.size() && x.type() == y.type());
CV_Assert(x.depth() == CV_32F);

@ -448,11 +454,13 @@ namespace
GpuMat mag1cn = mag ? mag->reshape(1) : GpuMat();
GpuMat angle1cn = angle ? angle->reshape(1) : GpuMat();

mathfunc::cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream);
cartToPolar_gpu(x1cn, y1cn, mag1cn, magSqr, angle1cn, angleInDegrees, stream);
}

inline void polarToCart_caller(const GpuMat& mag, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, cudaStream_t stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ mathfunc;

CV_DbgAssert((mag.empty() || mag.size() == angle.size()) && mag.type() == angle.type());
CV_Assert(mag.depth() == CV_32F);

@ -464,34 +472,33 @@ namespace
GpuMat x1cn = x.reshape(1);
GpuMat y1cn = y.reshape(1);

mathfunc::polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream);
polarToCart_gpu(mag1cn, angle1cn, x1cn, y1cn, angleInDegrees, stream);
}
}

void cv::gpu::magnitude(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
{
::cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream));
cartToPolar_caller(x, y, &dst, false, 0, false, StreamAccessor::getStream(stream));
}

void cv::gpu::magnitudeSqr(const GpuMat& x, const GpuMat& y, GpuMat& dst, Stream& stream)
{
::cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream));
cartToPolar_caller(x, y, &dst, true, 0, false, StreamAccessor::getStream(stream));
}

void cv::gpu::phase(const GpuMat& x, const GpuMat& y, GpuMat& angle, bool angleInDegrees, Stream& stream)
{
::cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
cartToPolar_caller(x, y, 0, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
}

void cv::gpu::cartToPolar(const GpuMat& x, const GpuMat& y, GpuMat& mag, GpuMat& angle, bool angleInDegrees, Stream& stream)
{
::cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
cartToPolar_caller(x, y, &mag, false, &angle, angleInDegrees, StreamAccessor::getStream(stream));
}

void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, Stream& stream)
{
::polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream));
polarToCart_caller(magnitude, angle, x, y, angleInDegrees, StreamAccessor::getStream(stream));
}
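
// A host-side usage sketch of the wrappers above (assuming x and y are
// single-channel CV_32F matrices of equal size, and that the stream
// parameter defaults to Stream::Null() as elsewhere in the gpu module):
void polarRoundTrip(const cv::gpu::GpuMat& x, const cv::gpu::GpuMat& y)
{
    cv::gpu::GpuMat mag, angle, x2, y2;

    cv::gpu::cartToPolar(x, y, mag, angle, true);   // (x, y) -> (magnitude, angle in degrees)
    cv::gpu::polarToCart(mag, angle, x2, y2, true); // inverse, up to floating-point error
}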

#endif /* !defined (HAVE_CUDA) */

@ -55,13 +55,19 @@ void cv::gpu::DisparityBilateralFilter::operator()(const GpuMat&, const GpuMat&,

#else /* !defined (HAVE_CUDA) */

namespace cv { namespace gpu { namespace bf
{
void load_constants(float* table_color, const DevMem2Df& table_space, int ndisp, int radius, short edge_disc, short max_disc);
BEGIN_OPENCV_DEVICE_NAMESPACE

void bilateral_filter_gpu(const DevMem2Db& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream);
void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream);
}}}
namespace bilateral_filter
{
void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc);

void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
}

END_OPENCV_DEVICE_NAMESPACE

using namespace OPENCV_DEVICE_NAMESPACE_ bilateral_filter;
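
// The refactoring replaces spelled-out nested namespaces with the
// BEGIN/END_OPENCV_DEVICE_NAMESPACE macros. Their definitions are not shown
// in this hunk; a plausible expansion, given here only as an assumption so
// the renamed scopes read naturally:
//
//     #define OPENCV_DEVICE_NAMESPACE       ::cv::gpu::device
//     #define OPENCV_DEVICE_NAMESPACE_      ::cv::gpu::device::
//     #define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device {
//     #define END_OPENCV_DEVICE_NAMESPACE   }}}
//
// Under that reading, "using namespace OPENCV_DEVICE_NAMESPACE_ bilateral_filter;"
// is "using namespace ::cv::gpu::device::bilateral_filter;".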

namespace
{

@ -105,7 +111,7 @@ namespace
short edge_disc = max<short>(short(1), short(ndisp * edge_threshold + 0.5));
short max_disc = short(ndisp * max_disc_threshold + 0.5);

bf::load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);
load_constants(table_color.ptr<float>(), table_space, ndisp, radius, edge_disc, max_disc);

if (&dst != &disp)
{

@ -115,7 +121,7 @@ namespace
disp.copyTo(dst);
}

bf::bilateral_filter_gpu((DevMem2D_<T>)dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
bilateral_filter_gpu((DevMem2D_<T>)dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
}

typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,

@ -52,15 +52,19 @@ void cv::gpu::blendLinear(const GpuMat&, const GpuMat&, const GpuMat&, const Gpu

#else

namespace cv { namespace gpu
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace blend
{
template <typename T>
void blendLinearCaller(int rows, int cols, int cn, const PtrStep<T>& img1, const PtrStep<T>& img2,
const PtrStepf& weights1, const PtrStepf& weights2, PtrStep<T> result, cudaStream_t stream);
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream);

void blendLinearCaller8UC4(int rows, int cols, const PtrStepb& img1, const PtrStepb& img2,
const PtrStepf& weights1, const PtrStepf& weights2, PtrStepb result, cudaStream_t stream);
}}
void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream);
}

END_OPENCV_DEVICE_NAMESPACE

using namespace OPENCV_DEVICE_NAMESPACE_ blend;

void cv::gpu::blendLinear(const GpuMat& img1, const GpuMat& img2, const GpuMat& weights1, const GpuMat& weights2,
GpuMat& result, Stream& stream)

@ -82,7 +82,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, vector< vec

#else /* !defined (HAVE_CUDA) */

namespace cv { namespace gpu { namespace bf_match
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace bf_match
{
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance,

@ -103,9 +105,9 @@ namespace cv { namespace gpu { namespace bf_match
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
int cc, cudaStream_t stream);
}}}
}

namespace cv { namespace gpu { namespace bf_knnmatch
namespace bf_knnmatch
{
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,

@ -126,9 +128,9 @@ namespace cv { namespace gpu { namespace bf_knnmatch
template <typename T> void match2Hamming_gpu(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,
int cc, cudaStream_t stream);
}}}
}

namespace cv { namespace gpu { namespace bf_radius_match
namespace bf_radius_match
{
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,

@ -151,15 +153,17 @@ namespace cv { namespace gpu { namespace bf_radius_match
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
int cc, cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

////////////////////////////////////////////////////////////////////
// Train collection

cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_)
{
}

////////////////////////////////////////////////////////////////////
// Train collection

void cv::gpu::BruteForceMatcher_GPU_base::add(const vector<GpuMat>& descCollection)
{
trainDescCollection.insert(trainDescCollection.end(), descCollection.begin(), descCollection.end());

@ -195,7 +199,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const
if (query.empty() || train.empty())
return;

using namespace cv::gpu::bf_match;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;

typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance,

@ -242,8 +246,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
if (trainIdx.empty() || distance.empty())
return;

Mat trainIdxCPU = trainIdx;
Mat distanceCPU = distance;
Mat trainIdxCPU(trainIdx);
Mat distanceCPU(distance);

matchConvert(trainIdxCPU, distanceCPU, matches);
}

@ -337,7 +341,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, c
if (query.empty() || trainCollection.empty())
return;

using namespace cv::gpu::bf_match;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_match;

typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,

@ -384,9 +388,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
if (trainIdx.empty() || imgIdx.empty() || distance.empty())
return;

Mat trainIdxCPU = trainIdx;
Mat imgIdxCPU = imgIdx;
Mat distanceCPU = distance;
Mat trainIdxCPU(trainIdx);
Mat imgIdxCPU(imgIdx);
Mat distanceCPU(distance);

matchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, matches);
}

@ -448,7 +452,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, co
if (query.empty() || train.empty())
return;

using namespace cv::gpu::bf_knnmatch;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;

typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, int k, const DevMem2Db& mask,
const DevMem2Db& trainIdx, const DevMem2Db& distance, const DevMem2Df& allDist,

@ -511,8 +515,8 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainId
if (trainIdx.empty() || distance.empty())
return;

Mat trainIdxCPU = trainIdx;
Mat distanceCPU = distance;
Mat trainIdxCPU(trainIdx);
Mat distanceCPU(distance);

knnMatchConvert(trainIdxCPU, distanceCPU, matches, compactResult);
}

@ -577,7 +581,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& quer
if (query.empty() || trainCollection.empty())
return;

using namespace cv::gpu::bf_knnmatch;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_knnmatch;

typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& trains, const DevMem2D_<PtrStepb>& masks,
const DevMem2Db& trainIdx, const DevMem2Db& imgIdx, const DevMem2Db& distance,

@ -630,9 +634,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainI
if (trainIdx.empty() || imgIdx.empty() || distance.empty())
return;

Mat trainIdxCPU = trainIdx;
Mat imgIdxCPU = imgIdx;
Mat distanceCPU = distance;
Mat trainIdxCPU(trainIdx);
Mat imgIdxCPU(imgIdx);
Mat distanceCPU(distance);

knnMatch2Convert(trainIdxCPU, imgIdxCPU, distanceCPU, matches, compactResult);
}

@ -758,7 +762,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat& query,
if (query.empty() || train.empty())
return;

using namespace cv::gpu::bf_radius_match;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;

typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,

@ -819,9 +823,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai
if (trainIdx.empty() || distance.empty() || nMatches.empty())
return;

Mat trainIdxCPU = trainIdx;
Mat distanceCPU = distance;
Mat nMatchesCPU = nMatches;
Mat trainIdxCPU(trainIdx);
Mat distanceCPU(distance);
Mat nMatchesCPU(nMatches);

radiusMatchConvert(trainIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult);
}

@ -889,7 +893,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& qu
if (query.empty() || empty())
return;

using namespace cv::gpu::bf_radius_match;
using namespace OPENCV_DEVICE_NAMESPACE_ bf_radius_match;

typedef void (*caller_t)(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,

@ -953,10 +957,10 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai
if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty())
return;

Mat trainIdxCPU = trainIdx;
Mat imgIdxCPU = imgIdx;
Mat distanceCPU = distance;
Mat nMatchesCPU = nMatches;
Mat trainIdxCPU(trainIdx);
Mat imgIdxCPU(imgIdx);
Mat distanceCPU(distance);
Mat nMatchesCPU(nMatches);

radiusMatchConvert(trainIdxCPU, imgIdxCPU, distanceCPU, nMatchesCPU, matches, compactResult);
}

@ -42,6 +42,10 @@

#include "precomp.hpp"

using namespace cv;
using namespace cv::gpu;
using namespace std;

#if !defined(HAVE_CUDA)

void cv::gpu::transformPoints(const GpuMat&, const Mat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }

@ -52,13 +56,31 @@ void cv::gpu::solvePnPRansac(const Mat&, const Mat&, const Mat&, const Mat&, Mat

#else

using namespace cv;
using namespace cv::gpu;
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace cv { namespace gpu { namespace transform_points
namespace transform_points
{
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
}}}
}

namespace project_points
{
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
}

namespace solve_pnp_ransac
{
int maxNumIters();

void computeHypothesisScores(
const int num_hypotheses, const int num_points, const float* rot_matrices,
const float3* transl_vectors, const float3* object, const float2* image,
const float dist_threshold, int* hypothesis_scores);
}

END_OPENCV_DEVICE_NAMESPACE

using namespace OPENCV_DEVICE_NAMESPACE;

namespace
{

@ -79,15 +101,9 @@ namespace

void cv::gpu::transformPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, GpuMat& dst, Stream& stream)
{
::transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream));
transformPointsCaller(src, rvec, tvec, dst, StreamAccessor::getStream(stream));
}

namespace cv { namespace gpu { namespace project_points
{
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
}}}

namespace
{
void projectPointsCaller(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, cudaStream_t stream)

@ -109,20 +125,9 @@ namespace

void cv::gpu::projectPoints(const GpuMat& src, const Mat& rvec, const Mat& tvec, const Mat& camera_mat, const Mat& dist_coef, GpuMat& dst, Stream& stream)
{
::projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream));
projectPointsCaller(src, rvec, tvec, camera_mat, dist_coef, dst, StreamAccessor::getStream(stream));
}

namespace cv { namespace gpu { namespace solve_pnp_ransac
{
int maxNumIters();

void computeHypothesisScores(
const int num_hypotheses, const int num_points, const float* rot_matrices,
const float3* transl_vectors, const float3* object, const float2* image,
const float dist_threshold, int* hypothesis_scores);
}}}

namespace
{
// Selects subset_size random different points from [0, num_points - 1] range

@ -46,7 +46,6 @@ using namespace cv;
using namespace cv::gpu;
using namespace std;

#if !defined (HAVE_CUDA)

cv::gpu::CascadeClassifier_GPU::CascadeClassifier_GPU() { throw_nogpu(); }

@ -51,155 +51,158 @@ void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(

#else /* !defined (HAVE_CUDA) */

namespace cv { namespace gpu { namespace device
{
#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
BEGIN_OPENCV_DEVICE_NAMESPACE

#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)
#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);

#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f)
#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _16u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f)

OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)
#define OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _32f) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_8u) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _full_32f)

OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_rgba)

OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(rgba_to_bgr565)

OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_bgra)

OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(gray_to_bgra)

OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(gray_to_bgr565)

OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(bgr565_to_gray)

OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_gray)

OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_yuv4)

OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(yuv4_to_bgra)

OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_YCrCb4)

OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(YCrCb4_to_bgra)

OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgra_to_xyz4)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_ALL(xyz4_to_bgra)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hsv4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hsv4)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hsv4_to_bgra)

OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgb_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(rgba_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgr_to_hls4)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(bgra_to_hls4)

#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F
}}}
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgb)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_rgba)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls_to_bgra)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgr)
OPENCV_GPU_DECLARE_CVTCOLOR_8U32F(hls4_to_bgra)

#undef OPENCV_GPU_DECLARE_CVTCOLOR_ONE
#undef OPENCV_GPU_DECLARE_CVTCOLOR_ALL
#undef OPENCV_GPU_DECLARE_CVTCOLOR_8U32F

END_OPENCV_DEVICE_NAMESPACE
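
// For concreteness, a single line of the declaration list expands as follows
// (derived directly from the OPENCV_GPU_DECLARE_CVTCOLOR_* definitions at the
// top of this block):
//
//     OPENCV_GPU_DECLARE_CVTCOLOR_ALL(bgr_to_rgb)
//     // becomes:
//     void bgr_to_rgb_8u (const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
//     void bgr_to_rgb_16u(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
//     void bgr_to_rgb_32f(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);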
|
||||
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
|
||||
namespace
|
||||
{
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -45,421 +45,423 @@
|
||||
#include "opencv2/gpu/device/vec_distance.hpp"
|
||||
#include "opencv2/gpu/device/datamov_utils.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::device;
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace cv { namespace gpu { namespace bf_radius_match
|
||||
namespace bf_radius_match {
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Match Unrolled
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled

template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
    PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
    #if __CUDA_ARCH__ >= 110

    extern __shared__ int smem[];

    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    Dist dist;

    #pragma unroll
    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;

        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

        if (loadX < query.cols)
        {
            T val;

            ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

            ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
        }

        __syncthreads();

        #pragma unroll
        for (int j = 0; j < BLOCK_SIZE; ++j)
            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

        __syncthreads();
    }

    float distVal = (typename Dist::result_type)dist;

    if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
    {
        unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
        if (ind < maxCount)
        {
            bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
            if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
            bestDistance.ptr(queryIdx)[ind] = distVal;
        }
    }

    #endif
}
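For reference, the kernel treats Dist purely as an accumulator object. Below is a minimal sketch of the contract it assumes, with member names inferred from the calls above; the shipped L1Dist/L2Dist/HammingDist functors live in the gpu device headers and may differ in detail.

// Sketch only (assumed interface, not the shipped implementation):
struct L1DistSketch
{
    typedef int value_type;   // element type staged in s_query/s_train
    typedef int result_type;  // type the kernel casts the functor to

    int mySum;

    __device__ __forceinline__ L1DistSketch() : mySum(0) {}

    // one call per descriptor-element pair, accumulating the distance
    __device__ __forceinline__ void reduceIter(int val1, int val2)
    {
        mySum += ::abs(val1 - val2);
    }

    // used by: float distVal = (typename Dist::result_type)dist;
    __device__ __forceinline__ operator result_type() const { return mySum; }
};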
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
        trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
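// Worked example of the launch arithmetic above, assuming the usual helper
// divUp(a, b) == (a + b - 1) / b from internal_shared.hpp. For BLOCK_SIZE = 16,
// a 1000-row query against a 5000-row train:
//   grid     = dim3(divUp(5000, 16), divUp(1000, 16)) = dim3(313, 63)
//   block    = dim3(16, 16)                          -> 256 threads per block
//   smemSize = 2 * 16 * 16 * sizeof(int) = 2048 bytes
//   (one BLOCK_SIZE x BLOCK_SIZE tile each for s_query and s_train)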
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    for (int i = 0; i < n; ++i)
    {
        const DevMem2D_<T> train = trains[i];

        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

        if (masks != 0 && masks[i].data)
        {
            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
        }
        else
        {
            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
        }
        cudaSafeCall( cudaGetLastError() );
    }

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match

template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
    PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
    #if __CUDA_ARCH__ >= 110

    extern __shared__ int smem[];

    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    Dist dist;

    for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;

        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

        if (loadX < query.cols)
        {
            T val;

            ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

            ForceGlob<T>::Load(train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
        }

        __syncthreads();

        #pragma unroll
        for (int j = 0; j < BLOCK_SIZE; ++j)
            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

        __syncthreads();
    }

    float distVal = (typename Dist::result_type)dist;

    if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
    {
        unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
        if (ind < maxCount)
        {
            bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
            if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
            bestDistance.ptr(queryIdx)[ind] = distVal;
        }
    }

    #endif
}
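Both kernels only require that the mask be callable as mask(queryIdx, trainIdx). A minimal sketch of the two functors the callers below construct, inferred from that usage; the shipped SingleMask/WithOutMask live in the gpu device utility headers.

// Sketch only (assumed interface, not the shipped implementation):
struct SingleMaskSketch
{
    explicit SingleMaskSketch(const DevMem2Db& mask_) : mask(mask_) {}

    // a candidate pair is considered only where the mask byte is non-zero
    __device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
    {
        return mask.ptr(queryIdx)[trainIdx] != 0;
    }

    const DevMem2Db mask;
};

struct WithOutMaskSketch
{
    // no mask: every query/train pair is a candidate
    __device__ __forceinline__ bool operator()(int, int) const { return true; }
};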
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
        trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, typename Dist, typename T>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    for (int i = 0; i < n; ++i)
    {
        const DevMem2D_<T> train = trains[i];

        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

        if (masks != 0 && masks[i].data)
        {
            match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
        }
        else
        {
            match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
        }
        cudaSafeCall( cudaGetLastError() );
    }

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match dispatcher

template <typename Dist, typename T, typename Mask>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    int cc, cudaStream_t stream)
{
    if (query.cols <= 64)
    {
        matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
    }
    /*else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
    }*/
    else
    {
        match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
    }
}
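// Dispatch in brief: descriptors up to 64 (resp. 128) elements wide take the
// fully unrolled path, where MAX_DESC_LEN is a compile-time constant and
// #pragma unroll can flatten the tile loop; anything wider falls back to the
// generic match kernel with its runtime loop bound. E.g. query.cols == 64
// (SURF-64 descriptors) selects matchUnrolled<16, 64, Dist>, while
// query.cols == 200 selects match<16, Dist>. The cc (compute capability)
// argument is threaded through but not consulted in this revision.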
template <typename Dist, typename T>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2Db* masks,
    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    int cc, cudaStream_t stream)
{
    if (query.cols <= 64)
    {
        matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
    }
    /*else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
    }*/
    else
    {
        match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
    }
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match caller

template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
            trainIdx, distance, nMatches,
            cc, stream);
    }
    else
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
            trainIdx, distance, nMatches,
            cc, stream);
    }
}

template void matchL1_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
            trainIdx, distance, nMatches,
            cc, stream);
    }
    else
    {
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
            trainIdx, distance, nMatches,
            cc, stream);
    }
}

//template void matchL2_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db& train, float maxDistance, const DevMem2Db& mask,
    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
            trainIdx, distance, nMatches,
            cc, stream);
    }
    else
    {
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
            trainIdx, distance, nMatches,
            cc, stream);
    }
}

template void matchHamming_gpu<uchar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const DevMem2Db& queryDescs, const DevMem2Db& trainDescs, float maxDistance, const DevMem2Db& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL1_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    int cc, cudaStream_t stream)
{
    matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
        trainIdx, imgIdx, distance, nMatches,
        cc, stream);
}

template void matchL1_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    int cc, cudaStream_t stream)
{
    matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
        trainIdx, imgIdx, distance, nMatches,
        cc, stream);
}

//template void matchL2_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks,
    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    int cc, cudaStream_t stream)
{
    matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
        trainIdx, imgIdx, distance, nMatches,
        cc, stream);
}

template void matchHamming_gpu<uchar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int >(const DevMem2Db& query, const DevMem2Db* trains, int n, float maxDistance, const DevMem2Db* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
} // namespace bf_radius_match

END_OPENCV_DEVICE_NAMESPACE
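A hypothetical host-side invocation of the exported single-train caller, for illustration only; buffer geometry follows the kernel's expectations (trainIdx/distance are nQuery x maxCount, nMatches holds one counter per query row), and the DevMem2D_(rows, cols, data, step) constructor is assumed from devmem2d.hpp. All pointers are device pointers allocated by the caller.

// DevMem2Db query(nQuery, descLen, d_query, queryStep);
// DevMem2Db train(nTrain, descLen, d_train, trainStep);
// DevMem2Db mask;                                          // empty -> WithOutMask path
// DevMem2Di trainIdx(nQuery, maxCount, d_idx, idxStep);
// DevMem2Df distance(nQuery, maxCount, d_dist, distStep);
// DevMem2D_<unsigned int> nMatches(1, nQuery, d_nMatches, nMatchesStep);
// bf_radius_match::matchL1_gpu<uchar>(query, train, maxDistance, mask,
//                                     trainIdx, distance, nMatches, cc, stream);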
@ -43,191 +43,186 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

namespace bilateral_filter {

__constant__ float* ctable_color;
__constant__ float* ctable_space;
__constant__ size_t ctable_space_step;

__constant__ int cndisp;
__constant__ int cradius;

__constant__ short cedge_disc;
__constant__ short cmax_disc;

void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc)
{
    cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );
    cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );
    size_t table_space_step = table_space.step / sizeof(float);
    cudaSafeCall( cudaMemcpyToSymbol(ctable_space_step, &table_space_step, sizeof(size_t)) );

    cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int)) );
    cudaSafeCall( cudaMemcpyToSymbol(cradius, &radius, sizeof(int)) );

    cudaSafeCall( cudaMemcpyToSymbol(cedge_disc, &edge_disc, sizeof(short)) );
    cudaSafeCall( cudaMemcpyToSymbol(cmax_disc, &max_disc, sizeof(short)) );
}
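// Note on the pattern above: the cudaMemcpyToSymbol calls copy the table
// *pointers* (and the row step) into __constant__ memory; the tables
// themselves stay in global memory and are read through ctable_color /
// ctable_space at filter time. A hypothetical setup, for illustration only
// (names assumed, not part of the commit):
//
// float* d_table_color;        // e.g. 256 precomputed color weights, device-resident
// DevMem2Df d_table_space;     // precomputed spatial weights, device-resident
// ... fill both tables on the device ...
// bilateral_filter::load_constants(d_table_color, d_table_space,
//                                  ndisp, radius, edge_disc, max_disc);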
template <int channels>
struct DistRgbMax
{
    static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
    {
        uchar x = ::abs(a[0] - b[0]);
        uchar y = ::abs(a[1] - b[1]);
        uchar z = ::abs(a[2] - b[2]);
        return (::max(::max(x, y), z));
    }
};

template <>
struct DistRgbMax<1>
{
    static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
    {
        return ::abs(a[0] - b[0]);
    }
};
template <int channels, typename T>
__global__ void bilateral_filter(int t, T* disp, size_t disp_step, const uchar* img, size_t img_step, int h, int w)
{
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);

    T dp[5];

    if (y > 0 && y < h - 1 && x > 0 && x < w - 1)
    {
        dp[0] = *(disp + (y  ) * disp_step + x + 0);
        dp[1] = *(disp + (y-1) * disp_step + x + 0);
        dp[2] = *(disp + (y  ) * disp_step + x - 1);
        dp[3] = *(disp + (y+1) * disp_step + x + 0);
        dp[4] = *(disp + (y  ) * disp_step + x + 1);

        if(::abs(dp[1] - dp[0]) >= cedge_disc || ::abs(dp[2] - dp[0]) >= cedge_disc || ::abs(dp[3] - dp[0]) >= cedge_disc || ::abs(dp[4] - dp[0]) >= cedge_disc)
        {
            const int ymin = ::max(0, y - cradius);
            const int xmin = ::max(0, x - cradius);
            const int ymax = ::min(h - 1, y + cradius);
            const int xmax = ::min(w - 1, x + cradius);

            float cost[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f};

            const uchar* ic = img + y * img_step + channels * x;

            for(int yi = ymin; yi <= ymax; yi++)
            {
                const T* disp_y = disp + yi * disp_step;

                for(int xi = xmin; xi <= xmax; xi++)
                {
                    const uchar* in = img + yi * img_step + channels * xi;

                    uchar dist_rgb = DistRgbMax<channels>::calc(in, ic);

                    const float weight = ctable_color[dist_rgb] * (ctable_space + ::abs(y-yi)* ctable_space_step)[::abs(x-xi)];

                    const T disp_reg = disp_y[xi];

                    cost[0] += ::min(cmax_disc, ::abs(disp_reg - dp[0])) * weight;
                    cost[1] += ::min(cmax_disc, ::abs(disp_reg - dp[1])) * weight;
                    cost[2] += ::min(cmax_disc, ::abs(disp_reg - dp[2])) * weight;
                    cost[3] += ::min(cmax_disc, ::abs(disp_reg - dp[3])) * weight;
                    cost[4] += ::min(cmax_disc, ::abs(disp_reg - dp[4])) * weight;
                }
            }

            float minimum = numeric_limits<float>::max();
            int id = 0;

            if (cost[0] < minimum)
            {
                minimum = cost[0];
                id = 0;
            }
            if (cost[1] < minimum)
            {
                minimum = cost[1];
                id = 1;
            }
            if (cost[2] < minimum)
            {
                minimum = cost[2];
                id = 2;
            }
            if (cost[3] < minimum)
            {
                minimum = cost[3];
                id = 3;
            }
            if (cost[4] < minimum)
            {
                minimum = cost[4];
                id = 4;
            }

            *(disp + y * disp_step + x) = dp[id];
        }
    }
}
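In equation form, the candidate cost accumulated by the loop above is (a transcription of the code, with d the disparity image, I the guide image, W the window of half-width cradius, T_c and T_s the color and space tables, and dp_0..dp_4 the five disparity hypotheses):

\[
\mathrm{cost}_k = \sum_{(x_i, y_i) \in W} \min\bigl(\mathrm{cmax\_disc},\, \lvert d(x_i, y_i) - dp_k \rvert\bigr) \cdot T_c\bigl[\Delta_{rgb}(I(x_i, y_i), I(x, y))\bigr] \cdot T_s\bigl[\lvert y - y_i \rvert\bigr]\bigl[\lvert x - x_i \rvert\bigr]
\]

and the pixel is overwritten with \( dp_{\arg\min_k \mathrm{cost}_k} \).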
template <typename T>
void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);
    grid.x = divUp(disp.cols, threads.x << 1);
    grid.y = divUp(disp.rows, threads.y);

    switch (channels)
    {
    case 1:
        for (int i = 0; i < iters; ++i)
        {
            bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
            cudaSafeCall( cudaGetLastError() );

            bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
            cudaSafeCall( cudaGetLastError() );
        }
        break;
    case 3:
        for (int i = 0; i < iters; ++i)
        {
            bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
            cudaSafeCall( cudaGetLastError() );

            bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
            cudaSafeCall( cudaGetLastError() );
        }
        break;
    default:
        cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
    }

    if (stream != 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
|
||||
|
||||
void bilateral_filter_gpu(const DevMem2D_<short>& disp, const DevMem2Db& img, int channels, int iters, cudaStream_t stream)
|
||||
{
|
||||
bilateral_filter_caller(disp, img, channels, iters, stream);
|
||||
}
|
||||
}}}
|
||||
void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
|
||||
{
|
||||
bilateral_filter_caller(disp, img, channels, iters, stream);
|
||||
}
|
||||
|
||||
void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
|
||||
{
|
||||
bilateral_filter_caller(disp, img, channels, iters, stream);
|
||||
}
|
||||
|
||||
} // namespace bilateral_filter
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
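The caller sizes its grid with divUp over threads.x << 1, which implies each thread covers two columns per iteration. A small check of that arithmetic, assuming divUp is the usual ceiling division:

#include <cstdio>

// Ceiling division, as commonly defined for launch-geometry helpers.
static int divUp(int total, int grain) { return (total + grain - 1) / grain; }

int main()
{
    const int cols = 640, rows = 480;
    const int tx = 32, ty = 8;
    // Matches grid.x = divUp(disp.cols, threads.x << 1), grid.y = divUp(disp.rows, threads.y)
    printf("grid = %d x %d\n", divUp(cols, tx << 1), divUp(rows, ty)); // grid = 10 x 60
    return 0;
}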
@ -42,81 +42,77 @@
#include "internal_shared.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

namespace blend {

template <typename T>
__global__ void blendLinearKernel(int rows, int cols, int cn, const PtrStep<T> img1, const PtrStep<T> img2,
                                  const PtrStepf weights1, const PtrStepf weights2, PtrStep<T> result)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (y < rows && x < cols)
    {
        int x_ = x / cn;
        float w1 = weights1.ptr(y)[x_];
        float w2 = weights2.ptr(y)[x_];
        T p1 = img1.ptr(y)[x];
        T p2 = img2.ptr(y)[x];
        result.ptr(y)[x] = (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
    }
}

template <typename T>
void blendLinearCaller(int rows, int cols, int cn, PtrStep<T> img1, PtrStep<T> img2, PtrStepf weights1, PtrStepf weights2, PtrStep<T> result, cudaStream_t stream)
{
    dim3 threads(16, 16);
    dim3 grid(divUp(cols * cn, threads.x), divUp(rows, threads.y));

    blendLinearKernel<<<grid, threads, 0, stream>>>(rows, cols * cn, cn, img1, img2, weights1, weights2, result);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall(cudaDeviceSynchronize());
}

template void blendLinearCaller<uchar>(int, int, int, PtrStep<uchar>, PtrStep<uchar>, PtrStepf, PtrStepf, PtrStep<uchar>, cudaStream_t stream);
template void blendLinearCaller<float>(int, int, int, PtrStep<float>, PtrStep<float>, PtrStepf, PtrStepf, PtrStep<float>, cudaStream_t stream);
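blendLinearKernel computes a normalized weighted combination per element, with a 1e-5f guard against a zero total weight. A host-side reference of the same formula, a sketch useful for checking GPU output rather than part of the module:

#include <cstdio>

// Reference for result = (p1*w1 + p2*w2) / (w1 + w2 + 1e-5f).
static float blendRef(float p1, float p2, float w1, float w2)
{
    return (p1 * w1 + p2 * w2) / (w1 + w2 + 1e-5f);
}

int main()
{
    printf("%f\n", blendRef(10.f, 20.f, 0.25f, 0.75f)); // ~17.5
    return 0;
}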
__global__ void blendLinearKernel8UC4(int rows, int cols, const PtrStepb img1, const PtrStepb img2,
                                      const PtrStepf weights1, const PtrStepf weights2, PtrStepb result)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (y < rows && x < cols)
    {
        float w1 = weights1.ptr(y)[x];
        float w2 = weights2.ptr(y)[x];
        float sum_inv = 1.f / (w1 + w2 + 1e-5f);
        w1 *= sum_inv;
        w2 *= sum_inv;
        uchar4 p1 = ((const uchar4*)img1.ptr(y))[x];
        uchar4 p2 = ((const uchar4*)img2.ptr(y))[x];
        ((uchar4*)result.ptr(y))[x] = make_uchar4(p1.x * w1 + p2.x * w2, p1.y * w1 + p2.y * w2,
                                                  p1.z * w1 + p2.z * w2, p1.w * w1 + p2.w * w2);
    }
}

void blendLinearCaller8UC4(int rows, int cols, PtrStepb img1, PtrStepb img2, PtrStepf weights1, PtrStepf weights2, PtrStepb result, cudaStream_t stream)
{
    dim3 threads(16, 16);
    dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));

    blendLinearKernel8UC4<<<grid, threads, 0, stream>>>(rows, cols, img1, img2, weights1, weights2, result);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall(cudaDeviceSynchronize());
}

} // namespace blend

END_OPENCV_DEVICE_NAMESPACE
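The 8UC4 specialization folds the division into one reciprocal (sum_inv) and scales both weights once, instead of dividing each of the four channels. A quick sketch confirming the two forms agree:

#include <cstdio>

int main()
{
    float p = 40.f, q = 200.f, w1 = 0.3f, w2 = 0.6f;
    float direct  = (p * w1 + q * w2) / (w1 + w2 + 1e-5f);
    float sum_inv = 1.f / (w1 + w2 + 1e-5f);
    float fused   = p * (w1 * sum_inv) + q * (w2 * sum_inv);
    printf("%f %f\n", direct, fused); // both ~146.66
    return 0;
}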
@ -44,153 +44,149 @@
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

#define SOLVE_PNP_RANSAC_MAX_NUM_ITERS 200

namespace transform_points
{
    __constant__ float3 crot0;
    __constant__ float3 crot1;
    __constant__ float3 crot2;
    __constant__ float3 ctransl;

    struct TransformOp : unary_function<float3, float3>
    {
        __device__ __forceinline__ float3 operator()(const float3& p) const
        {
            return make_float3(
                    crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
                    crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
                    crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
        }
    };

    void call(const DevMem2D_<float3> src, const float* rot,
              const float* transl, DevMem2D_<float3> dst,
              cudaStream_t stream)
    {
        cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
        cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
        cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
        cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
        OPENCV_DEVICE_NAMESPACE_ transform(src, dst, TransformOp(), stream);
    }
} // namespace transform_points
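TransformOp applies p' = R*p + t with the rotation stored row-wise in crot0..crot2. A host-side sketch of the same arithmetic, with float3 replaced by a plain struct:

#include <cstdio>

struct V3 { float x, y, z; };

// p' = R * p + t, with R given as three row vectors.
static V3 transformPoint(const V3 r[3], V3 t, V3 p)
{
    return { r[0].x * p.x + r[0].y * p.y + r[0].z * p.z + t.x,
             r[1].x * p.x + r[1].y * p.y + r[1].z * p.z + t.y,
             r[2].x * p.x + r[2].y * p.y + r[2].z * p.z + t.z };
}

int main()
{
    V3 R[3] = { {0.f, -1.f, 0.f}, {1.f, 0.f, 0.f}, {0.f, 0.f, 1.f} }; // 90 deg about z
    V3 t = {1.f, 0.f, 0.f}, p = {1.f, 0.f, 0.f};
    V3 q = transformPoint(R, t, p);
    printf("%g %g %g\n", q.x, q.y, q.z); // 1 1 0
    return 0;
}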
namespace project_points
{
    __constant__ float3 crot0;
    __constant__ float3 crot1;
    __constant__ float3 crot2;
    __constant__ float3 ctransl;
    __constant__ float3 cproj0;
    __constant__ float3 cproj1;

    struct ProjectOp : unary_function<float3, float2>
    {
        __device__ __forceinline__ float2 operator()(const float3& p) const
        {
            // Rotate and translate in 3D
            float3 t = make_float3(
                    crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
                    crot1.x * p.x + crot1.y * p.y + crot1.z * p.z + ctransl.y,
                    crot2.x * p.x + crot2.y * p.y + crot2.z * p.z + ctransl.z);
            // Project on 2D plane
            return make_float2(
                    (cproj0.x * t.x + cproj0.y * t.y) / t.z + cproj0.z,
                    (cproj1.x * t.x + cproj1.y * t.y) / t.z + cproj1.z);
        }
    };

    void call(const DevMem2D_<float3> src, const float* rot,
              const float* transl, const float* proj, DevMem2D_<float2> dst,
              cudaStream_t stream)
    {
        cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
        cudaSafeCall(cudaMemcpyToSymbol(crot1, rot + 3, sizeof(float) * 3));
        cudaSafeCall(cudaMemcpyToSymbol(crot2, rot + 6, sizeof(float) * 3));
        cudaSafeCall(cudaMemcpyToSymbol(ctransl, transl, sizeof(float) * 3));
        cudaSafeCall(cudaMemcpyToSymbol(cproj0, proj, sizeof(float) * 3));
        cudaSafeCall(cudaMemcpyToSymbol(cproj1, proj + 3, sizeof(float) * 3));
        OPENCV_DEVICE_NAMESPACE_ transform(src, dst, ProjectOp(), stream);
    }
} // namespace project_points
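ProjectOp divides the rotated point by its depth and applies an affine map, which is the pinhole model if cproj0 and cproj1 hold the intrinsic rows (fx, skew, cx) and (0, fy, cy); that layout is an assumption here, consistent with the two rows copied from proj. A sketch:

#include <cstdio>

int main()
{
    // Assumed intrinsics: cproj0 = (fx, 0, cx), cproj1 = (0, fy, cy).
    float fx = 500.f, fy = 500.f, cx = 320.f, cy = 240.f;
    float X = 0.5f, Y = -0.25f, Z = 2.f; // camera-frame point after R*p + t
    float u = (fx * X + 0.f * Y) / Z + cx;
    float v = (0.f * X + fy * Y) / Z + cy;
    printf("u=%g v=%g\n", u, v); // u=445 v=177.5
    return 0;
}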
namespace solve_pnp_ransac
{
    __constant__ float3 crot_matrices[SOLVE_PNP_RANSAC_MAX_NUM_ITERS * 3];
    __constant__ float3 ctransl_vectors[SOLVE_PNP_RANSAC_MAX_NUM_ITERS];

    int maxNumIters()
    {
        return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
    }

    __device__ __forceinline__ float sqr(float x)
    {
        return x * x;
    }

    __global__ void computeHypothesisScoresKernel(
            const int num_points, const float3* object, const float2* image,
            const float dist_threshold, int* g_num_inliers)
    {
        const float3* const &rot_mat = crot_matrices + blockIdx.x * 3;
        const float3 &transl_vec = ctransl_vectors[blockIdx.x];
        int num_inliers = 0;

        for (int i = threadIdx.x; i < num_points; i += blockDim.x)
        {
            float3 p = object[i];
            p = make_float3(
                    rot_mat[0].x * p.x + rot_mat[0].y * p.y + rot_mat[0].z * p.z + transl_vec.x,
                    rot_mat[1].x * p.x + rot_mat[1].y * p.y + rot_mat[1].z * p.z + transl_vec.y,
                    rot_mat[2].x * p.x + rot_mat[2].y * p.y + rot_mat[2].z * p.z + transl_vec.z);
            p.x /= p.z;
            p.y /= p.z;
            float2 image_p = image[i];
            if (sqr(p.x - image_p.x) + sqr(p.y - image_p.y) < dist_threshold)
                ++num_inliers;
        }

        extern __shared__ float s_num_inliers[];
        s_num_inliers[threadIdx.x] = num_inliers;
        __syncthreads();

        for (int step = blockDim.x / 2; step > 0; step >>= 1)
        {
            if (threadIdx.x < step)
                s_num_inliers[threadIdx.x] += s_num_inliers[threadIdx.x + step];
            __syncthreads();
        }

        if (threadIdx.x == 0)
            g_num_inliers[blockIdx.x] = s_num_inliers[0];
    }

    void computeHypothesisScores(
            const int num_hypotheses, const int num_points, const float* rot_matrices,
            const float3* transl_vectors, const float3* object, const float2* image,
            const float dist_threshold, int* hypothesis_scores)
    {
        cudaSafeCall(cudaMemcpyToSymbol(crot_matrices, rot_matrices, num_hypotheses * 3 * sizeof(float3)));
        cudaSafeCall(cudaMemcpyToSymbol(ctransl_vectors, transl_vectors, num_hypotheses * sizeof(float3)));

        dim3 threads(256);
        dim3 grid(num_hypotheses);
        int smem_size = threads.x * sizeof(float);

        computeHypothesisScoresKernel<<<grid, threads, smem_size>>>(
                num_points, object, image, dist_threshold, hypothesis_scores);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall( cudaDeviceSynchronize() );
    }
} // namespace solve_pnp_ransac

END_OPENCV_DEVICE_NAMESPACE
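computeHypothesisScoresKernel finishes with a classic shared-memory tree reduction: the stride halves each step until thread 0 holds the block total, which assumes blockDim.x is a power of two (it is launched with 256 threads). A standalone sketch of just that pattern:

#include <cstdio>
#include <cuda_runtime.h>

// Sums blockDim.x values per block; lane 0 writes the block total.
__global__ void blockSum(const int* in, int* out)
{
    extern __shared__ int s[];
    s[threadIdx.x] = in[blockIdx.x * blockDim.x + threadIdx.x];
    __syncthreads();

    for (int step = blockDim.x / 2; step > 0; step >>= 1)
    {
        if (threadIdx.x < step)
            s[threadIdx.x] += s[threadIdx.x + step];
        __syncthreads();
    }

    if (threadIdx.x == 0)
        out[blockIdx.x] = s[0];
}

int main()
{
    const int n = 256;
    int h[n], sum = 0;
    for (int i = 0; i < n; ++i) { h[i] = 1; sum += h[i]; }

    int *d_in, *d_out, result;
    cudaMalloc(&d_in, n * sizeof(int));
    cudaMalloc(&d_out, sizeof(int));
    cudaMemcpy(d_in, h, n * sizeof(int), cudaMemcpyHostToDevice);
    blockSum<<<1, n, n * sizeof(int)>>>(d_in, d_out);
    cudaMemcpy(&result, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d == %d\n", result, sum); // 256 == 256
    cudaFree(d_in); cudaFree(d_out);
    return 0;
}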
@ -44,339 +44,370 @@
#include <algorithm>
#include "internal_shared.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

namespace canny {

__global__ void calcSobelRowPass(const PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
{
    __shared__ int smem[16][18];

    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    const int i = blockIdx.y * blockDim.y + threadIdx.y;

    if (i < rows)
    {
        smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
        if (threadIdx.x == 0)
        {
            smem[threadIdx.y][0] = src.ptr(i)[::max(j - 1, 0)];
            smem[threadIdx.y][17] = src.ptr(i)[::min(j + 16, cols - 1)];
        }
        __syncthreads();

        if (j < cols)
        {
            dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
            dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
        }
    }
}

void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
{
    dim3 block(16, 16, 1);
    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

    calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

struct L1
{
    static __device__ __forceinline__ float calc(int x, int y)
    {
        return ::abs(x) + ::abs(y);
    }
};
struct L2
{
    static __device__ __forceinline__ float calc(int x, int y)
    {
        return ::sqrtf(x * x + y * y);
    }
};

template <typename Norm> __global__ void calcMagnitude(const PtrStepi dx_buf, const PtrStepi dy_buf,
    PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
{
    __shared__ int sdx[18][16];
    __shared__ int sdy[18][16];

    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    const int i = blockIdx.y * blockDim.y + threadIdx.y;

    if (j < cols)
    {
        sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
        sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
        if (threadIdx.y == 0)
        {
            sdx[0][threadIdx.x] = dx_buf.ptr(::max(i - 1, 0))[j];
            sdx[17][threadIdx.x] = dx_buf.ptr(::min(i + 16, rows - 1))[j];

            sdy[0][threadIdx.x] = dy_buf.ptr(::max(i - 1, 0))[j];
            sdy[17][threadIdx.x] = dy_buf.ptr(::min(i + 16, rows - 1))[j];
        }
        __syncthreads();

        if (i < rows)
        {
            int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
            int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];

            dx.ptr(i)[j] = x;
            dy.ptr(i)[j] = y;

            mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
        }
    }
}

void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
{
    dim3 block(16, 16, 1);
    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

    if (L2Grad)
        calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
    else
        calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);

    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
{
    const int j = blockIdx.x * blockDim.x + threadIdx.x;
    const int i = blockIdx.y * blockDim.y + threadIdx.y;

    if (i < rows && j < cols)
        mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
}

void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
{
    dim3 block(16, 16, 1);
    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

    if (L2Grad)
        calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
    else
        calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);

    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}
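calcSobelRowPass and the buffered calcMagnitude together form a separable Sobel: the row pass applies [-1 0 1] (for dx) and [1 2 1] (for dy) horizontally, and the column pass applies the complementary vectors vertically. A sketch verifying the outer product reproduces the full 3x3 horizontal-derivative kernel:

#include <cstdio>

int main()
{
    const int col[3] = { 1, 2, 1 };   // smoothing, applied vertically for dx
    const int row[3] = { -1, 0, 1 };  // derivative, applied horizontally for dx
    for (int i = 0; i < 3; ++i)
    {
        for (int j = 0; j < 3; ++j)
            printf("%3d", col[i] * row[j]); // -1 0 1 / -2 0 2 / -1 0 1
        printf("\n");
    }
    return 0;
}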
//////////////////////////////////////////////////////////////////////////////////////////
#define CANNY_SHIFT 15
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)

__global__ void calcMap(const PtrStepi dx, const PtrStepi dy, const PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
{
    __shared__ float smem[18][18];

    const int j = blockIdx.x * 16 + threadIdx.x;
    const int i = blockIdx.y * 16 + threadIdx.y;

    const int tid = threadIdx.y * 16 + threadIdx.x;
    const int lx = tid % 18;
    const int ly = tid / 18;

    if (ly < 14)
        smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];

    if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
        smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];

    __syncthreads();

    if (i < rows && j < cols)
    {
        int x = dx.ptr(i)[j];
        int y = dy.ptr(i)[j];
        const int s = (x ^ y) < 0 ? -1 : 1;
        const float m = smem[threadIdx.y + 1][threadIdx.x + 1];

        x = ::abs(x);
        y = ::abs(y);

        // 0 - the pixel can not belong to an edge
        // 1 - the pixel might belong to an edge
        // 2 - the pixel does belong to an edge
        int edge_type = 0;

        if (m > low_thresh)
        {
            const int tg22x = x * TG22;
            const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);

            y <<= CANNY_SHIFT;

            if (y < tg22x)
            {
                if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else if (y > tg67x)
            {
                if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else
            {
                if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
                    edge_type = 1 + (int)(m > high_thresh);
            }
        }

        map.ptr(i + 1)[j + 1] = edge_type;
    }
}

#undef CANNY_SHIFT
#undef TG22

void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
{
    dim3 block(16, 16, 1);
    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

    calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}
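TG22 is tan(22.5 deg) in Q15 fixed point, and tg67x = tg22x + (2x << 15) uses the identity tan(67.5) = tan(22.5) + 2; comparing y << 15 against them classifies the gradient into horizontal, vertical, or diagonal sectors without floating-point math. A host-side sketch of the sector test:

#include <cstdio>

#define CANNY_SHIFT 15
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)

int main()
{
    int x = 100, y = 30; // |dx|, |dy|
    int tg22x = x * TG22;
    int tg67x = tg22x + ((x + x) << CANNY_SHIFT);
    int ys = y << CANNY_SHIFT;
    printf("%s\n", ys < tg22x ? "horizontal" : ys > tg67x ? "vertical" : "diagonal");
    return 0; // prints "horizontal": 30/100 < tan(22.5 deg)
}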
//////////////////////////////////////////////////////////////////////////////////////////
__device__ unsigned int counter = 0;

__global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
{
#if __CUDA_ARCH__ >= 120

    __shared__ int smem[18][18];

    const int j = blockIdx.x * 16 + threadIdx.x;
    const int i = blockIdx.y * 16 + threadIdx.y;

    const int tid = threadIdx.y * 16 + threadIdx.x;
    const int lx = tid % 18;
    const int ly = tid / 18;

    if (ly < 14)
        smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];

    if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
        smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];

    __syncthreads();

    if (i < rows && j < cols)
    {
        int n;

        #pragma unroll
        for (int k = 0; k < 16; ++k)
        {
            n = 0;

            if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
            {
                n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;

                n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;

                n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
            }

            if (n > 0)
                smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
        }

        const int e = smem[threadIdx.y + 1][threadIdx.x + 1];

        map.ptr(i + 1)[j + 1] = e;

        n = 0;

        if (e == 2)
        {
            n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;

            n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;

            n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
        }

        if (n > 0)
        {
            const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
            st[ind] = make_ushort2(j + 1, i + 1);
        }
    }

#endif
}

void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
{
    dim3 block(16, 16, 1);
    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

    edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}
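The local pass pushes newly promoted edge pixels onto a global stack by reserving slots with atomicInc; the (unsigned int)(-1) wrap threshold makes it behave as a plain increment. A standalone sketch of the pattern (kernel and symbol names are hypothetical):

#include <cstdio>
#include <cuda_runtime.h>

__device__ unsigned int d_count = 0;

// Each thread whose flag is set appends its index to out at a unique slot.
__global__ void pushIf(const int* flags, int* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && flags[i])
    {
        unsigned int slot = atomicInc(&d_count, (unsigned int)(-1));
        out[slot] = i;
    }
}

int main()
{
    const int n = 64;
    int h_flags[n];
    for (int i = 0; i < n; ++i) h_flags[i] = (i % 4 == 0);

    int *d_flags, *d_out;
    cudaMalloc(&d_flags, n * sizeof(int));
    cudaMalloc(&d_out, n * sizeof(int));
    cudaMemcpy(d_flags, h_flags, n * sizeof(int), cudaMemcpyHostToDevice);
    pushIf<<<1, n>>>(d_flags, d_out, n);

    unsigned int count;
    cudaMemcpyFromSymbol(&count, d_count, sizeof(unsigned int));
    printf("pushed %u items\n", count); // pushed 16 items
    cudaFree(d_flags); cudaFree(d_out);
    return 0;
}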
__constant__ int c_dx[8] = {-1,  0,  1, -1,  1, -1, 0, 1};
__constant__ int c_dy[8] = {-1, -1, -1,  0,  0,  1, 1, 1};

__global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
{
#if __CUDA_ARCH__ >= 120

    const int stack_size = 512;

    __shared__ unsigned int s_counter;
    __shared__ unsigned int s_ind;
    __shared__ ushort2 s_st[stack_size];

    if (threadIdx.x == 0)
        s_counter = 0;
    __syncthreads();

    int ind = blockIdx.y * gridDim.x + blockIdx.x;

    if (ind < count)
    {
        ushort2 pos = st1[ind];

        if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
        {
            if (threadIdx.x < 8)
            {
                pos.x += c_dx[threadIdx.x];
                pos.y += c_dy[threadIdx.x];

                if (map.ptr(pos.y)[pos.x] == 1)
                {
                    map.ptr(pos.y)[pos.x] = 2;

                    ind = atomicInc(&s_counter, (unsigned int)(-1));

                    s_st[ind] = pos;
                }
            }
            __syncthreads();

            while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
            {
                const int subTaskIdx = threadIdx.x >> 3;
                const int portion = ::min(s_counter, blockDim.x >> 3);

                pos.x = pos.y = 0;

                if (subTaskIdx < portion)
                    pos = s_st[s_counter - 1 - subTaskIdx];
                __syncthreads();

                if (threadIdx.x == 0)
                    s_counter -= portion;
                __syncthreads();

                if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
                {
                    pos.x += c_dx[threadIdx.x & 7];
                    pos.y += c_dy[threadIdx.x & 7];

                    if (map.ptr(pos.y)[pos.x] == 1)
                    {

@ -388,103 +419,75 @@ namespace cv { namespace gpu { namespace canny

                    }
                }
                __syncthreads();
            }

            if (s_counter > 0)
            {
                if (threadIdx.x == 0)
                {
                    ind = atomicAdd(&counter, s_counter);
                    s_ind = ind - s_counter;
                }
                __syncthreads();

                ind = s_ind;

                for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
                {
                    st2[ind + i] = s_st[i];
                }
            }
        }
    }

#endif
}

void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
{
    void* counter_ptr;
    cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );

    unsigned int count;
    cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );

    while (count > 0)
    {
        cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );

        dim3 block(128, 1, 1);
        dim3 grid(min(count, 65535u), divUp(count, 65535), 1);
        edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());

        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );

        std::swap(st1, st2);
    }
}

__global__ void getEdges(PtrStepi map, PtrStepb dst, int rows, int cols)
{
    const int j = blockIdx.x * 16 + threadIdx.x;
    const int i = blockIdx.y * 16 + threadIdx.y;

    if (i < rows && j < cols)
        dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
}

void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols)
{
    dim3 block(16, 16, 1);
    dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

    getEdges<<<grid, block>>>(map, dst, rows, cols);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall(cudaThreadSynchronize());
}

} // namespace canny

END_OPENCV_DEVICE_NAMESPACE
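getEdges collapses the three-state map to a binary image in one expression: (uchar)(-(v >> 1)) sends 2 (confirmed edge) to 255 and both 0 and 1 to 0. A quick check:

#include <cstdio>

int main()
{
    for (int v = 0; v <= 2; ++v)
        printf("%d -> %u\n", v, (unsigned char)(-(v >> 1))); // 0->0, 1->0, 2->255
    return 0;
}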
@ -44,336 +44,337 @@
|
||||
#include "opencv2/gpu/device/transform.hpp"
|
||||
#include "opencv2/gpu/device/color.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_rgba_traits<uchar>::functor_type)
|
||||
{
|
||||
template <> struct TransformFunctorTraits<bgra_to_rgba_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<bgra_to_rgba_traits<uchar>::functor_type>
|
||||
{
|
||||
enum { smart_block_dim_x = 8 };
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
enum { smart_block_dim_x = 8 };
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
template <> struct TransformFunctorTraits<bgra_to_bgr555_traits::functor_type> : DefaultTransformFunctorTraits<bgra_to_bgr555_traits::functor_type>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <> struct TransformFunctorTraits<rgba_to_bgr555_traits::functor_type> : DefaultTransformFunctorTraits<rgba_to_bgr555_traits::functor_type>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <> struct TransformFunctorTraits<bgra_to_bgr565_traits::functor_type> : DefaultTransformFunctorTraits<bgra_to_bgr565_traits::functor_type>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <> struct TransformFunctorTraits<rgba_to_bgr565_traits::functor_type> : DefaultTransformFunctorTraits<rgba_to_bgr565_traits::functor_type>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr555_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr555_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_bgr565_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_bgr565_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
template <> struct TransformFunctorTraits<bgr555_to_bgra_traits::functor_type> : DefaultTransformFunctorTraits<bgr555_to_bgra_traits::functor_type>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <> struct TransformFunctorTraits<bgr555_to_rgba_traits::functor_type> : DefaultTransformFunctorTraits<bgr555_to_rgba_traits::functor_type>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <> struct TransformFunctorTraits<bgr565_to_bgra_traits::functor_type> : DefaultTransformFunctorTraits<bgr565_to_bgra_traits::functor_type>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <> struct TransformFunctorTraits<bgr565_to_rgba_traits::functor_type> : DefaultTransformFunctorTraits<bgr565_to_rgba_traits::functor_type>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_bgra_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr555_to_rgba_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_bgra_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgr565_to_rgba_traits::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
template <> struct TransformFunctorTraits<gray_to_bgra_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<gray_to_bgra_traits<uchar>::functor_type>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgra_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
template <> struct TransformFunctorTraits<gray_to_bgr555_traits::functor_type> : DefaultTransformFunctorTraits<gray_to_bgr555_traits::functor_type>
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <> struct TransformFunctorTraits<gray_to_bgr565_traits::functor_type> : DefaultTransformFunctorTraits<gray_to_bgr565_traits::functor_type>
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr555_traits::functor_type)
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(gray_to_bgr565_traits::functor_type)
|
||||
{
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
template <> struct TransformFunctorTraits<bgra_to_yuv4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<bgra_to_yuv4_traits<uchar>::functor_type>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <> struct TransformFunctorTraits<rgba_to_yuv4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<rgba_to_yuv4_traits<uchar>::functor_type>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_yuv4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_yuv4_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
template <> struct TransformFunctorTraits<yuv4_to_bgra_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<yuv4_to_bgra_traits<uchar>::functor_type>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <> struct TransformFunctorTraits<yuv4_to_rgba_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<yuv4_to_rgba_traits<uchar>::functor_type>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_bgra_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(yuv4_to_rgba_traits<uchar>::functor_type)
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
|
||||
template <> struct TransformFunctorTraits<bgra_to_YCrCb4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<bgra_to_YCrCb4_traits<uchar>::functor_type>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
template <> struct TransformFunctorTraits<rgba_to_YCrCb4_traits<uchar>::functor_type> : DefaultTransformFunctorTraits<rgba_to_YCrCb4_traits<uchar>::functor_type>
|
||||
{
|
||||
enum { smart_block_dim_y = 8 };
|
||||
enum { smart_shift = 4 };
|
||||
};
|
||||
DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_YCrCb4_traits<uchar>::functor_type)
{
    enum { smart_block_dim_y = 8 };
    enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_YCrCb4_traits<uchar>::functor_type)
{
    enum { smart_block_dim_y = 8 };
    enum { smart_shift = 4 };
};

DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_bgra_traits<uchar>::functor_type)
{
    enum { smart_block_dim_y = 8 };
    enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(YCrCb4_to_rgba_traits<uchar>::functor_type)
{
    enum { smart_block_dim_y = 8 };
    enum { smart_shift = 4 };
};

DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_xyz4_traits<uchar>::functor_type)
{
    enum { smart_block_dim_y = 8 };
    enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_xyz4_traits<uchar>::functor_type)
{
    enum { smart_block_dim_y = 8 };
    enum { smart_shift = 4 };
};

DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_bgra_traits<uchar>::functor_type)
{
    enum { smart_block_dim_y = 8 };
    enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(xyz4_to_rgba_traits<uchar>::functor_type)
{
    enum { smart_block_dim_y = 8 };
    enum { smart_shift = 4 };
};

DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hsv4_traits<uchar>::functor_type)
{
    enum { smart_block_dim_y = 8 };
    enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hsv4_traits<uchar>::functor_type)
{
    enum { smart_block_dim_y = 8 };
    enum { smart_shift = 4 };
};

DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_bgra_traits<uchar>::functor_type)
{
    enum { smart_block_dim_y = 8 };
    enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(hsv4_to_rgba_traits<uchar>::functor_type)
{
    enum { smart_block_dim_y = 8 };
    enum { smart_shift = 4 };
};

DEFINE_TRANSFORM_FUNCTOR_TRAITS(bgra_to_hls4_traits<uchar>::functor_type)
{
    enum { smart_block_dim_y = 8 };
    enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(rgba_to_hls4_traits<uchar>::functor_type)
{
    enum { smart_block_dim_y = 8 };
    enum { smart_shift = 4 };
};

DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_bgra_traits<uchar>::functor_type)
{
    enum { smart_block_dim_y = 8 };
    enum { smart_shift = 4 };
};
DEFINE_TRANSFORM_FUNCTOR_TRAITS(hls4_to_rgba_traits<uchar>::functor_type)
{
    enum { smart_block_dim_y = 8 };
    enum { smart_shift = 4 };
};
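
// Editorial note (hedged assumption, inferred from the explicit specializations
// that the DEFINE_TRANSFORM_FUNCTOR_TRAITS blocks above replace in this file):
// the macro is presumably shorthand for
//
//   template <> struct TransformFunctorTraits< type > :
//       DefaultTransformFunctorTraits< type >
//
// so each block only overrides the launch-tuning enums (smart_block_dim_y,
// smart_shift) and inherits everything else from the defaults.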

#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
    void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream) \
    { \
        traits::functor_type functor = traits::create_functor(); \
        typedef typename traits::functor_type::argument_type src_t; \
        typedef typename traits::functor_type::result_type dst_t; \
        OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, stream); \
    }

#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, name ## _traits)

#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(name) \
    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _16u, name ## _traits<ushort>) \
    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>)

#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(name) \
    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _8u, name ## _traits<uchar>) \
    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _32f, name ## _traits<float>) \
    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_8u, name ## _full_traits<uchar>) \
    OPENCV_GPU_IMPLEMENT_CVTCOLOR(name ## _full_32f, name ## _full_traits<float>)
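
// Illustrative expansion (not part of the original source): invoking the
// helper above as OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555) pastes the
// traits name and yields, roughly,
//
//   void bgr_to_bgr555(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream)
//   {
//       bgr_to_bgr555_traits::functor_type functor = bgr_to_bgr555_traits::create_functor();
//       typedef bgr_to_bgr555_traits::functor_type::argument_type src_t;
//       typedef bgr_to_bgr555_traits::functor_type::result_type dst_t;
//       OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, stream);
//   }
//
// i.e. one exported function per conversion, each a thin wrapper over the
// generic element-wise transform() launcher.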

OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_rgba)

OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgb_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgra_to_bgr565)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(rgba_to_bgr565)

OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_bgra)

OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(gray_to_bgra)

OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr555)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(gray_to_bgr565)

OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr555_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(bgr565_to_gray)

OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_gray)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_gray)

OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_yuv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_yuv4)

OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(yuv4_to_bgra)

OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_YCrCb4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_YCrCb4)

OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(YCrCb4_to_bgra)

OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgb_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(rgba_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_xyz4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgra_to_xyz4)

OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(xyz4_to_bgra)

OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hsv4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hsv4)

OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hsv4_to_bgra)

OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgb_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(rgba_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgr_to_hls4)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(bgra_to_hls4)

OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgb)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_rgba)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls_to_bgra)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgr)
OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F(hls4_to_bgra)

#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL
#undef OPENCV_GPU_IMPLEMENT_CVTCOLOR_8U32F

END_OPENCV_DEVICE_NAMESPACE
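
// Editorial note (assumption, derived from the macros defined above): the
// _ALL variant stamps out one entry point per supported depth, so for example
// OPENCV_GPU_IMPLEMENT_CVTCOLOR_ALL(bgr_to_rgb) emits bgr_to_rgb_8u(),
// bgr_to_rgb_16u() and bgr_to_rgb_32f(); the _8U32F variant emits the _8u and
// _32f forms plus the *_full_* variants used only by the full-range HSV/HLS
// conversions instantiated above.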

@ -47,8 +47,7 @@
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
@ -56,195 +55,195 @@ using namespace cv::gpu::device;
#define RESULT_STEPS 8
#define HALO_STEPS 1

namespace column_filter {

__constant__ float c_kernel[MAX_KERNEL_SIZE];

void loadKernel(const float kernel[], int ksize)
{
    cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
}
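
// Editorial note: the filter taps live in __constant__ memory, so all threads
// of a warp reading the same c_kernel[j] hit the constant cache's broadcast
// path. loadKernel() must run on the host before any launch;
// linearColumnFilter_gpu() below calls it on every invocation.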

template <int KERNEL_SIZE, typename T, typename D, typename B>
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
{
    typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;

    __shared__ T smem[BLOCK_DIM_X][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_Y + 1];

    //Offset to the upper halo edge
    const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
    const int y = (blockIdx.y * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_Y + threadIdx.y;

    if (x < src.cols)
    {
        const T* src_col = src.ptr() + x;

        //Main data
        #pragma unroll
        for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
            smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);

        //Upper halo
        #pragma unroll
        for(int i = 0; i < HALO_STEPS; ++i)
            smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_low(y + i * BLOCK_DIM_Y, src_col, src.step);

        //Lower halo
        #pragma unroll
        for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
            smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y] = b.at_high(y + i * BLOCK_DIM_Y, src_col, src.step);

        __syncthreads();

        #pragma unroll
        for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
        {
            sum_t sum = VecTraits<sum_t>::all(0);

            #pragma unroll
            for(int j = 0; j < KERNEL_SIZE; ++j)
                sum = sum + smem[threadIdx.x][threadIdx.y + i * BLOCK_DIM_Y + j - anchor] * c_kernel[j];

            int dstY = y + i * BLOCK_DIM_Y;

            if (dstY < src.rows)
                dst.ptr(dstY)[x] = saturate_cast<D>(sum);
        }
    }
}
|
||||
template <int ksize, typename T, typename D, template<typename> class B>
|
||||
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
|
||||
{
|
||||
template <int ksize, typename T, typename D, template<typename> class B>
|
||||
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
|
||||
const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
|
||||
const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, RESULT_STEPS * BLOCK_DIM_Y));
|
||||
|
||||
B<T> b(src.rows);
|
||||
|
||||
linearColumnFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
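
// Editorial note: KERNEL_SIZE is a compile-time template parameter so the
// inner tap loop in linearColumnFilter can be fully unrolled; the cost is one
// kernel instantiation per supported size, which is exactly what the 5 x 17
// dispatch table in linearColumnFilter_gpu below pays for (border mode x
// kernel size, index 0 unused).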

template <typename T, typename D>
void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
{
    typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
    static const caller_t callers[5][17] =
    {
        {
            0,
            linearColumnFilter_caller<1 , T, D, BrdColReflect101>,
            linearColumnFilter_caller<2 , T, D, BrdColReflect101>,
            linearColumnFilter_caller<3 , T, D, BrdColReflect101>,
            linearColumnFilter_caller<4 , T, D, BrdColReflect101>,
            linearColumnFilter_caller<5 , T, D, BrdColReflect101>,
            linearColumnFilter_caller<6 , T, D, BrdColReflect101>,
            linearColumnFilter_caller<7 , T, D, BrdColReflect101>,
            linearColumnFilter_caller<8 , T, D, BrdColReflect101>,
            linearColumnFilter_caller<9 , T, D, BrdColReflect101>,
            linearColumnFilter_caller<10, T, D, BrdColReflect101>,
            linearColumnFilter_caller<11, T, D, BrdColReflect101>,
            linearColumnFilter_caller<12, T, D, BrdColReflect101>,
            linearColumnFilter_caller<13, T, D, BrdColReflect101>,
            linearColumnFilter_caller<14, T, D, BrdColReflect101>,
            linearColumnFilter_caller<15, T, D, BrdColReflect101>,
            linearColumnFilter_caller<16, T, D, BrdColReflect101>
        },
        {
            0,
            linearColumnFilter_caller<1 , T, D, BrdColReplicate>,
            linearColumnFilter_caller<2 , T, D, BrdColReplicate>,
            linearColumnFilter_caller<3 , T, D, BrdColReplicate>,
            linearColumnFilter_caller<4 , T, D, BrdColReplicate>,
            linearColumnFilter_caller<5 , T, D, BrdColReplicate>,
            linearColumnFilter_caller<6 , T, D, BrdColReplicate>,
            linearColumnFilter_caller<7 , T, D, BrdColReplicate>,
            linearColumnFilter_caller<8 , T, D, BrdColReplicate>,
            linearColumnFilter_caller<9 , T, D, BrdColReplicate>,
            linearColumnFilter_caller<10, T, D, BrdColReplicate>,
            linearColumnFilter_caller<11, T, D, BrdColReplicate>,
            linearColumnFilter_caller<12, T, D, BrdColReplicate>,
            linearColumnFilter_caller<13, T, D, BrdColReplicate>,
            linearColumnFilter_caller<14, T, D, BrdColReplicate>,
            linearColumnFilter_caller<15, T, D, BrdColReplicate>,
            linearColumnFilter_caller<16, T, D, BrdColReplicate>
        },
        {
            0,
            linearColumnFilter_caller<1 , T, D, BrdColConstant>,
            linearColumnFilter_caller<2 , T, D, BrdColConstant>,
            linearColumnFilter_caller<3 , T, D, BrdColConstant>,
            linearColumnFilter_caller<4 , T, D, BrdColConstant>,
            linearColumnFilter_caller<5 , T, D, BrdColConstant>,
            linearColumnFilter_caller<6 , T, D, BrdColConstant>,
            linearColumnFilter_caller<7 , T, D, BrdColConstant>,
            linearColumnFilter_caller<8 , T, D, BrdColConstant>,
            linearColumnFilter_caller<9 , T, D, BrdColConstant>,
            linearColumnFilter_caller<10, T, D, BrdColConstant>,
            linearColumnFilter_caller<11, T, D, BrdColConstant>,
            linearColumnFilter_caller<12, T, D, BrdColConstant>,
            linearColumnFilter_caller<13, T, D, BrdColConstant>,
            linearColumnFilter_caller<14, T, D, BrdColConstant>,
            linearColumnFilter_caller<15, T, D, BrdColConstant>,
            linearColumnFilter_caller<16, T, D, BrdColConstant>
        },
        {
            0,
            linearColumnFilter_caller<1 , T, D, BrdColReflect>,
            linearColumnFilter_caller<2 , T, D, BrdColReflect>,
            linearColumnFilter_caller<3 , T, D, BrdColReflect>,
            linearColumnFilter_caller<4 , T, D, BrdColReflect>,
            linearColumnFilter_caller<5 , T, D, BrdColReflect>,
            linearColumnFilter_caller<6 , T, D, BrdColReflect>,
            linearColumnFilter_caller<7 , T, D, BrdColReflect>,
            linearColumnFilter_caller<8 , T, D, BrdColReflect>,
            linearColumnFilter_caller<9 , T, D, BrdColReflect>,
            linearColumnFilter_caller<10, T, D, BrdColReflect>,
            linearColumnFilter_caller<11, T, D, BrdColReflect>,
            linearColumnFilter_caller<12, T, D, BrdColReflect>,
            linearColumnFilter_caller<13, T, D, BrdColReflect>,
            linearColumnFilter_caller<14, T, D, BrdColReflect>,
            linearColumnFilter_caller<15, T, D, BrdColReflect>,
            linearColumnFilter_caller<16, T, D, BrdColReflect>
        },
        {
            0,
            linearColumnFilter_caller<1 , T, D, BrdColWrap>,
            linearColumnFilter_caller<2 , T, D, BrdColWrap>,
            linearColumnFilter_caller<3 , T, D, BrdColWrap>,
            linearColumnFilter_caller<4 , T, D, BrdColWrap>,
            linearColumnFilter_caller<5 , T, D, BrdColWrap>,
            linearColumnFilter_caller<6 , T, D, BrdColWrap>,
            linearColumnFilter_caller<7 , T, D, BrdColWrap>,
            linearColumnFilter_caller<8 , T, D, BrdColWrap>,
            linearColumnFilter_caller<9 , T, D, BrdColWrap>,
            linearColumnFilter_caller<10, T, D, BrdColWrap>,
            linearColumnFilter_caller<11, T, D, BrdColWrap>,
            linearColumnFilter_caller<12, T, D, BrdColWrap>,
            linearColumnFilter_caller<13, T, D, BrdColWrap>,
            linearColumnFilter_caller<14, T, D, BrdColWrap>,
            linearColumnFilter_caller<15, T, D, BrdColWrap>,
            linearColumnFilter_caller<16, T, D, BrdColWrap>,
        }
    };

    loadKernel(kernel, ksize);

    callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
}

template void linearColumnFilter_gpu<float , uchar >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float4, uchar4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearColumnFilter_gpu<float , short >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearColumnFilter_gpu<float2, short2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, short3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , int   >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearColumnFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);

} // namespace column_filter

END_OPENCV_DEVICE_NAMESPACE
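
// Illustrative host-side call (hypothetical values, not part of the original
// source): a 7-tap column convolution of a single-channel float image.
// brd_type indexes the first dimension of the callers table, so 0 selects
// BrdColReflect101; ksize must be 1..16 and anchor marks the kernel center.
//
//   const float kernel[7] = { 0.004f, 0.054f, 0.242f, 0.400f,
//                             0.242f, 0.054f, 0.004f };       // hypothetical taps
//   // src and dst are pre-allocated single-channel float device buffers
//   // viewed as DevMem2Db; passing stream 0 makes the call synchronize.
//   linearColumnFilter_gpu<float, float>(src, dst, kernel, 7, 3, 0, 0);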

@ -43,85 +43,87 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

namespace copy_make_border {

template <typename Ptr2D, typename T> __global__ void copyMakeBorder(const Ptr2D src, DevMem2D_<T> dst, int top, int left)
{
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;

    if (x < dst.cols && y < dst.rows)
        dst.ptr(y)[x] = src(y - top, x - left);
}
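
// Editorial note: the kernel above never branches on the border mode itself.
// Its Ptr2D argument is a BorderReader (constructed in the dispatcher below)
// that composes the raw PtrStep with one of the Brd* policies, so the
// potentially out-of-range (y - top, x - left) reads are remapped, or
// replaced by the constant border value, inside operator().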

template <template <typename> class B, typename T> struct CopyMakeBorderDispatcher
{
    static void call(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, int top, int left,
        const typename VecTraits<T>::elem_type* borderValue, cudaStream_t stream)
    {
        dim3 block(32, 8);
        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

        B<T> brd(src.rows, src.cols, VecTraits<T>::make(borderValue));
        BorderReader< PtrStep<T>, B<T> > brdSrc(src, brd);

        copyMakeBorder<<<grid, block, 0, stream>>>(brdSrc, dst, top, left);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
};

template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode,
    const T* borderValue, cudaStream_t stream)
{
    typedef typename TypeVec<T, cn>::vec_type vec_type;

    typedef void (*caller_t)(const DevMem2D_<vec_type>& src, const DevMem2D_<vec_type>& dst, int top, int left, const T* borderValue, cudaStream_t stream);

    static const caller_t callers[5] =
    {
        CopyMakeBorderDispatcher<BrdReflect101, vec_type>::call,
        CopyMakeBorderDispatcher<BrdReplicate, vec_type>::call,
        CopyMakeBorderDispatcher<BrdConstant, vec_type>::call,
        CopyMakeBorderDispatcher<BrdReflect, vec_type>::call,
        CopyMakeBorderDispatcher<BrdWrap, vec_type>::call
    };

    callers[borderMode](DevMem2D_<vec_type>(src), DevMem2D_<vec_type>(dst), top, left, borderValue, stream);
}

template void copyMakeBorder_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const uchar* borderValue, cudaStream_t stream);

//template void copyMakeBorder_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const schar* borderValue, cudaStream_t stream);

template void copyMakeBorder_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const ushort* borderValue, cudaStream_t stream);

template void copyMakeBorder_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const short* borderValue, cudaStream_t stream);

//template void copyMakeBorder_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const int* borderValue, cudaStream_t stream);

template void copyMakeBorder_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
//template void copyMakeBorder_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);
template void copyMakeBorder_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const float* borderValue, cudaStream_t stream);

} // namespace copy_make_border

END_OPENCV_DEVICE_NAMESPACE
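
// Illustrative host-side call (hypothetical values, not part of the original
// source): pad a 3-channel uchar image with a constant black border.
// borderMode indexes the callers table above (0 reflect-101, 1 replicate,
// 2 constant, 3 reflect, 4 wrap); dst must already be allocated at the padded
// size, and borderValue supplies one element per channel.
//
//   const uchar borderValue[3] = { 0, 0, 0 };      // hypothetical border color
//   // top/left give the offset of src inside dst; stream 0 synchronizes.
//   copyMakeBorder_gpu<uchar, 3>(src, dst, top, left, 2, borderValue, 0);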
File diff suppressed because it is too large

@ -45,9 +45,7 @@
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

#define UINT_BITS 32U

@ -67,154 +65,157 @@ using namespace cv::gpu::device;

#define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)

namespace hist {

#if (!USE_SMEM_ATOMICS)

#define TAG_MASK ( (1U << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE)) - 1U )

__forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
{
    uint count;
    do
    {
        count = s_WarpHist[data] & TAG_MASK;
        count = threadTag | (count + 1);
        s_WarpHist[data] = count;
    } while (s_WarpHist[data] != count);
}

#else

#define TAG_MASK 0xFFFFFFFFU

__forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)
{
    atomicAdd(s_WarpHist + data, 1);
}

#endif

__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
{
    uint x = pos_x << 2;

    if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);
    if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);
    if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
    if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
}

__global__ void histogram256(const PtrStep<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
{
    //Per-warp subhistogram storage
    __shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
    uint* s_WarpHist= s_Hist + (threadIdx.x >> OPENCV_GPU_LOG_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;

    //Clear shared memory storage for current threadblock before processing
    #pragma unroll
    for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
        s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;

    //Cycle through the entire data set, update subhistograms for each warp
    const uint tag = threadIdx.x << (UINT_BITS - OPENCV_GPU_LOG_WARP_SIZE);

    __syncthreads();
    const uint colsui = d_Data.step / sizeof(uint);
    for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
    {
        uint pos_y = pos / colsui;
        uint pos_x = pos % colsui;
        uint data = d_Data.ptr(pos_y)[pos_x];
        addWord(s_WarpHist, data, tag, pos_x, cols);
    }

    //Merge per-warp histograms into per-block and write to global memory
    __syncthreads();
    for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
    {
        uint sum = 0;

        for (uint i = 0; i < WARP_COUNT; i++)
            sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;

        d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
    }
}
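
// Editorial note on addByte() above: on devices without shared-memory
// atomics (pre sm_12), the increment is emulated by tagging the top
// OPENCV_GPU_LOG_WARP_SIZE bits of the bin counter with the writing thread's
// lane id and retrying until this thread's write survives the intra-warp
// race; TAG_MASK strips that tag again when the per-warp histograms are
// summed into the per-block result.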

////////////////////////////////////////////////////////////////////////////////
// Merge histogram256() output
// Run one threadblock per bin; each threadblock adds up the same bin counter
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
// takes only a fraction of total processing time
////////////////////////////////////////////////////////////////////////////////

__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
{
    uint sum = 0;

    #pragma unroll
    for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
        sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];

    __shared__ uint data[MERGE_THREADBLOCK_SIZE];
    data[threadIdx.x] = sum;

    for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
    {
        __syncthreads();
        if(threadIdx.x < stride)
            data[threadIdx.x] += data[threadIdx.x + stride];
    }

    if(threadIdx.x == 0)
        d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
}

void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream)
{
    histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
        DevMem2D_<uint>(src),
        buf,
        static_cast<uint>(src.rows * src.step / sizeof(uint)),
        src.cols);

    cudaSafeCall( cudaGetLastError() );

    mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);

    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
|
||||
|
||||
__constant__ int c_lut[256];
|
||||
|
||||
__global__ void equalizeHist(const DevMem2Db src, PtrStepb dst)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x < src.cols && y < src.rows)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
if (x < src.cols && y < src.rows)
|
||||
{
|
||||
const uchar val = src.ptr(y)[x];
|
||||
const int lut = c_lut[val];
|
||||
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
|
||||
}
|
||||
const uchar val = src.ptr(y)[x];
|
||||
const int lut = c_lut[val];
|
||||
dst.ptr(y)[x] = __float2int_rn(255.0f / (src.cols * src.rows) * lut);
|
||||
}
|
||||
}
|
||||
|
||||
void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(16, 16);
|
||||
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
|
||||
void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream)
|
||||
{
|
||||
dim3 block(16, 16);
|
||||
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
|
||||
|
||||
cudaSafeCall( cudaMemcpyToSymbol(cv::gpu::histograms::c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
|
||||
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
|
||||
|
||||
equalizeHist<<<grid, block, 0, stream>>>(src, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
equalizeHist<<<grid, block, 0, stream>>>(src, dst);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}}}
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
} // namespace hist
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
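The equalizeHist kernel above maps each gray level through a LUT scaled by 255 / (src.cols * src.rows). A minimal host-side sketch of a LUT that matches the kernel's formula, written as plain C++ for illustration only (the helper name and buffers are assumptions, not part of the patch):

#include <vector>

// Hypothetical helper: build a 256-entry LUT as the running sum of the
// histogram, so that 255.0f / total * lut[v] in the kernel yields the
// equalized intensity. 'hist' would come from the histogram pass above.
std::vector<int> buildEqualizeLut(const std::vector<int>& hist)
{
    std::vector<int> lut(256);
    int cdf = 0;
    for (int i = 0; i < 256; ++i)
    {
        cdf += hist[i];  // cumulative count of pixels with value <= i
        lut[i] = cdf;
    }
    return lut;
}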
@ -42,13 +42,15 @@

#include "internal_shared.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

// Other values are not supported
#define CELL_WIDTH 8
#define CELL_HEIGHT 8
#define CELLS_PER_BLOCK_X 2
#define CELLS_PER_BLOCK_Y 2

namespace cv { namespace gpu { namespace hog {
namespace hog {

__constant__ int cnbins;
__constant__ int cblock_stride_x;
@ -83,23 +85,23 @@ int power_2up(unsigned int n)
void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
                      int nblocks_win_x, int nblocks_win_y)
{
    uploadConstant("cv::gpu::hog::cnbins", nbins);
    uploadConstant("cv::gpu::hog::cblock_stride_x", block_stride_x);
    uploadConstant("cv::gpu::hog::cblock_stride_y", block_stride_y);
    uploadConstant("cv::gpu::hog::cnblocks_win_x", nblocks_win_x);
    uploadConstant("cv::gpu::hog::cnblocks_win_y", nblocks_win_y);
    cudaSafeCall( cudaMemcpyToSymbol(cnbins, &nbins, sizeof(nbins)) );
    cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_x, &block_stride_x, sizeof(block_stride_x)) );
    cudaSafeCall( cudaMemcpyToSymbol(cblock_stride_y, &block_stride_y, sizeof(block_stride_y)) );
    cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_x, &nblocks_win_x, sizeof(nblocks_win_x)) );
    cudaSafeCall( cudaMemcpyToSymbol(cnblocks_win_y, &nblocks_win_y, sizeof(nblocks_win_y)) );

    int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
    uploadConstant("cv::gpu::hog::cblock_hist_size", block_hist_size);
    cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size, &block_hist_size, sizeof(block_hist_size)) );

    int block_hist_size_2up = power_2up(block_hist_size);
    uploadConstant("cv::gpu::hog::cblock_hist_size_2up", block_hist_size_2up);
    cudaSafeCall( cudaMemcpyToSymbol(cblock_hist_size_2up, &block_hist_size_2up, sizeof(block_hist_size_2up)) );

    int descr_width = nblocks_win_x * block_hist_size;
    uploadConstant("cv::gpu::hog::cdescr_width", descr_width);
    cudaSafeCall( cudaMemcpyToSymbol(cdescr_width, &descr_width, sizeof(descr_width)) );

    int descr_size = descr_width * nblocks_win_y;
    uploadConstant("cv::gpu::hog::cdescr_size", descr_size);
    cudaSafeCall( cudaMemcpyToSymbol(cdescr_size, &descr_size, sizeof(descr_size)) );
}
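The hunk above replaces the string-based uploadConstant() helper with direct cudaMemcpyToSymbol() calls on the __constant__ symbols themselves. A self-contained sketch of the adopted pattern (the symbol and function names below are illustrative only):

#include <cuda_runtime.h>

__constant__ int c_example;           // device-side constant, as with cnbins

__global__ void readExample(int* out)
{
    *out = c_example;                 // every thread reads the cached constant
}

void setExample(int value)
{
    // Passing the symbol itself avoids looking constants up by their name
    // as a string, which is the deprecated form uploadConstant() relied on.
    cudaMemcpyToSymbol(c_example, &value, sizeof(value));
}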
@ -153,10 +155,10 @@ __global__ void compute_hists_kernel_many_blocks(const int img_block_width, cons
            int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
            int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);

            float gaussian = expf(-(dist_center_y * dist_center_y +
                                    dist_center_x * dist_center_x) * scale);
            float interp_weight = (8.f - fabs(dist_y + 0.5f)) *
                                  (8.f - fabs(dist_x + 0.5f)) / 64.f;
            float gaussian = ::expf(-(dist_center_y * dist_center_y +
                                      dist_center_x * dist_center_x) * scale);
            float interp_weight = (8.f - ::fabs(dist_y + 0.5f)) *
                                  (8.f - ::fabs(dist_x + 0.5f)) / 64.f;

            hist[bin.x * 48 * nblocks] += gaussian * interp_weight * vote.x;
            hist[bin.y * 48 * nblocks] += gaussian * interp_weight * vote.y;
@ -273,15 +275,15 @@ __global__ void normalize_hists_kernel_many_blocks(const int block_hist_size,
    __syncthreads();
    float sum = reduce_smem<nthreads>(squares);

    float scale = 1.0f / (sqrtf(sum) + 0.1f * block_hist_size);
    elem = min(elem * scale, threshold);
    float scale = 1.0f / (::sqrtf(sum) + 0.1f * block_hist_size);
    elem = ::min(elem * scale, threshold);

    __syncthreads();
    squares[threadIdx.x] = elem * elem;

    __syncthreads();
    sum = reduce_smem<nthreads>(squares);
    scale = 1.0f / (sqrtf(sum) + 1e-3f);
    scale = 1.0f / (::sqrtf(sum) + 1e-3f);

    if (threadIdx.x < block_hist_size)
        hist[0] = elem * scale;
@ -533,7 +535,7 @@ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrEl

    if (threadIdx.x == 0)
    {
        val = row[max(x - 1, 1)];
        val = row[::max(x - 1, 1)];
        sh_row[0] = val.x;
        sh_row[(nthreads + 2)] = val.y;
        sh_row[2 * (nthreads + 2)] = val.z;
@ -541,7 +543,7 @@ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrEl

    if (threadIdx.x == blockDim.x - 1)
    {
        val = row[min(x + 1, width - 2)];
        val = row[::min(x + 1, width - 2)];
        sh_row[blockDim.x + 1] = val.x;
        sh_row[blockDim.x + 1 + (nthreads + 2)] = val.y;
        sh_row[blockDim.x + 1 + 2 * (nthreads + 2)] = val.z;
@ -561,7 +563,7 @@ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrEl

    float3 dx;
    if (correct_gamma)
        dx = make_float3(sqrtf(b.x) - sqrtf(a.x), sqrtf(b.y) - sqrtf(a.y), sqrtf(b.z) - sqrtf(a.z));
        dx = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));
    else
        dx = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);

@ -576,7 +578,7 @@ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrEl
        b = make_float3(val.x, val.y, val.z);

        if (correct_gamma)
            dy = make_float3(sqrtf(b.x) - sqrtf(a.x), sqrtf(b.y) - sqrtf(a.y), sqrtf(b.z) - sqrtf(a.z));
            dy = make_float3(::sqrtf(b.x) - ::sqrtf(a.x), ::sqrtf(b.y) - ::sqrtf(a.y), ::sqrtf(b.z) - ::sqrtf(a.z));
        else
            dy = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);
    }
@ -601,10 +603,10 @@ __global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrEl
        mag0 = mag1;
    }

    mag0 = sqrtf(mag0);
    mag0 = ::sqrtf(mag0);

    float ang = (atan2f(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
    int hidx = (int)floorf(ang);
    float ang = (::atan2f(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
    int hidx = (int)::floorf(ang);
    ang -= hidx;
    hidx = (hidx + cnbins) % cnbins;

@ -648,10 +650,10 @@ __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrEl
        sh_row[threadIdx.x + 1] = row[width - 2];

    if (threadIdx.x == 0)
        sh_row[0] = row[max(x - 1, 1)];
        sh_row[0] = row[::max(x - 1, 1)];

    if (threadIdx.x == blockDim.x - 1)
        sh_row[blockDim.x + 1] = row[min(x + 1, width - 2)];
        sh_row[blockDim.x + 1] = row[::min(x + 1, width - 2)];

    __syncthreads();
    if (x < width)
@ -659,7 +661,7 @@ __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrEl
        float dx;

        if (correct_gamma)
            dx = sqrtf(sh_row[threadIdx.x + 2]) - sqrtf(sh_row[threadIdx.x]);
            dx = ::sqrtf(sh_row[threadIdx.x + 2]) - ::sqrtf(sh_row[threadIdx.x]);
        else
            dx = sh_row[threadIdx.x + 2] - sh_row[threadIdx.x];

@ -669,14 +671,14 @@ __global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrEl
        float a = ((const unsigned char*)img.ptr(blockIdx.y + 1))[x];
        float b = ((const unsigned char*)img.ptr(blockIdx.y - 1))[x];
        if (correct_gamma)
            dy = sqrtf(a) - sqrtf(b);
            dy = ::sqrtf(a) - ::sqrtf(b);
        else
            dy = a - b;
    }
    float mag = sqrtf(dx * dx + dy * dy);
    float mag = ::sqrtf(dx * dx + dy * dy);

    float ang = (atan2f(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
    int hidx = (int)floorf(ang);
    float ang = (::atan2f(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
    int hidx = (int)::floorf(ang);
    ang -= hidx;
    hidx = (hidx + cnbins) % cnbins;

@ -768,4 +770,6 @@ static void resize_for_hog(const DevMem2Db& src, DevMem2Db dst, TEX& tex)
void resize_8UC1(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
void resize_8UC4(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }

}}}
} // namespace hog

END_OPENCV_DEVICE_NAMESPACE
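Two things change in the gradient kernels above: the math intrinsics are qualified with :: so they resolve in the global namespace now that the code sits inside cv::gpu::device, and the gamma-correction path keeps applying sqrtf to the raw intensities before differencing. The branch in isolation, as a sketch (the function name is illustrative, not from the patch):

// Restatement of the correct_gamma branch: taking ::sqrtf of the
// intensities before differencing compresses the dynamic range, which is
// the gamma-compression step of the HOG descriptor.
__device__ float gradient_x(float left, float right, bool correct_gamma)
{
    return correct_gamma ? ::sqrtf(right) - ::sqrtf(left)
                         : right - left;
}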
File diff suppressed because it is too large
@ -43,11 +43,15 @@
#ifndef __OPENCV_internal_shared_HPP__
#define __OPENCV_internal_shared_HPP__

#include <cuda_runtime.h>
#include <npp.h>
#include "NPP_staging.hpp"
#include "opencv2/gpu/devmem2d.hpp"
#include "safe_call.hpp"
#include "cuda_runtime.h"
#include "npp.h"
#include "NPP_staging.hpp"

#ifndef CV_PI
#define CV_PI 3.1415926535897932384626433832795f
#endif

#ifndef CV_PI_F
#ifndef CV_PI
|
||||
#endif
|
||||
#endif
|
||||
|
||||
namespace cv
|
||||
#define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device {
|
||||
#define END_OPENCV_DEVICE_NAMESPACE }}}
|
||||
#define OPENCV_DEVICE_NAMESPACE ::cv::gpu::device
|
||||
#define OPENCV_DEVICE_NAMESPACE_ ::cv::gpu::device::
|
||||
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
typedef unsigned char uchar;
|
||||
typedef unsigned short ushort;
|
||||
typedef signed char schar;
|
||||
typedef unsigned int uint;
|
||||
|
||||
template<class T> static inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)
|
||||
{
|
||||
namespace gpu
|
||||
{
|
||||
typedef unsigned char uchar;
|
||||
typedef signed char schar;
|
||||
typedef unsigned short ushort;
|
||||
typedef unsigned int uint;
|
||||
|
||||
enum
|
||||
{
|
||||
BORDER_REFLECT101_GPU = 0,
|
||||
BORDER_REPLICATE_GPU,
|
||||
BORDER_CONSTANT_GPU,
|
||||
BORDER_REFLECT_GPU,
|
||||
BORDER_WRAP_GPU
|
||||
};
|
||||
|
||||
// Converts CPU border extrapolation mode into GPU internal analogue.
|
||||
// Returns true if the GPU analogue exists, false otherwise.
|
||||
bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);
|
||||
|
||||
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
|
||||
|
||||
template<class T> static inline void uploadConstant(const char* name, const T& value)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) );
|
||||
}
|
||||
|
||||
template<class T> static inline void uploadConstant(const char* name, const T& value, cudaStream_t stream)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbolAsync(name, &value, sizeof(T), 0, cudaMemcpyHostToDevice, stream) );
|
||||
}
|
||||
|
||||
template<class T> static inline void bindTexture(const char* name, const DevMem2D_<T>& img/*, bool normalized = false,
|
||||
enum cudaTextureFilterMode filterMode = cudaFilterModePoint, enum cudaTextureAddressMode addrMode = cudaAddressModeClamp*/)
|
||||
{
|
||||
//!!!! const_cast is disabled!
|
||||
//!!!! Please use constructor of 'class texture' instead.
|
||||
|
||||
//textureReference* tex;
|
||||
//cudaSafeCall( cudaGetTextureReference((const textureReference**)&tex, name) );
|
||||
//tex->normalized = normalized;
|
||||
//tex->filterMode = filterMode;
|
||||
//tex->addressMode[0] = addrMode;
|
||||
//tex->addressMode[1] = addrMode;
|
||||
|
||||
const textureReference* tex;
|
||||
cudaSafeCall( cudaGetTextureReference(&tex, name) );
|
||||
|
||||
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
|
||||
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
|
||||
}
|
||||
|
||||
static inline void unbindTexture(const char *name)
|
||||
{
|
||||
const textureReference* tex;
|
||||
cudaSafeCall( cudaGetTextureReference(&tex, name) );
|
||||
cudaSafeCall( cudaUnbindTexture(tex) );
|
||||
}
|
||||
|
||||
class TextureBinder
|
||||
{
|
||||
public:
|
||||
TextureBinder() : tex_(0) {}
|
||||
template <typename T> TextureBinder(const textureReference* tex, const DevMem2D_<T>& img) : tex_(0)
|
||||
{
|
||||
bind(tex, img);
|
||||
}
|
||||
template <typename T> TextureBinder(const char* tex_name, const DevMem2D_<T>& img) : tex_(0)
|
||||
{
|
||||
bind(tex_name, img);
|
||||
}
|
||||
~TextureBinder() { unbind(); }
|
||||
|
||||
template <typename T> void bind(const textureReference* tex, const DevMem2D_<T>& img)
|
||||
{
|
||||
unbind();
|
||||
|
||||
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
|
||||
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
|
||||
|
||||
tex_ = tex;
|
||||
}
|
||||
template <typename T> void bind(const char* tex_name, const DevMem2D_<T>& img)
|
||||
{
|
||||
const textureReference* tex;
|
||||
cudaSafeCall( cudaGetTextureReference(&tex, tex_name) );
|
||||
bind(tex, img);
|
||||
}
|
||||
|
||||
void unbind()
|
||||
{
|
||||
if (tex_)
|
||||
{
|
||||
cudaUnbindTexture(tex_);
|
||||
tex_ = 0;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const textureReference* tex_;
|
||||
};
|
||||
|
||||
class NppStreamHandler
|
||||
{
|
||||
public:
|
||||
inline explicit NppStreamHandler(cudaStream_t newStream = 0)
|
||||
{
|
||||
oldStream = nppGetStream();
|
||||
nppSetStream(newStream);
|
||||
}
|
||||
|
||||
inline ~NppStreamHandler()
|
||||
{
|
||||
nppSetStream(oldStream);
|
||||
}
|
||||
|
||||
private:
|
||||
cudaStream_t oldStream;
|
||||
};
|
||||
|
||||
class NppStStreamHandler
|
||||
{
|
||||
public:
|
||||
inline explicit NppStStreamHandler(cudaStream_t newStream = 0)
|
||||
{
|
||||
oldStream = nppStSetActiveCUDAstream(newStream);
|
||||
}
|
||||
|
||||
inline ~NppStStreamHandler()
|
||||
{
|
||||
nppStSetActiveCUDAstream(oldStream);
|
||||
}
|
||||
|
||||
private:
|
||||
cudaStream_t oldStream;
|
||||
};
|
||||
}
|
||||
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
|
||||
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace cv { namespace gpu
|
||||
{
|
||||
enum
|
||||
{
|
||||
BORDER_REFLECT101_GPU = 0,
|
||||
BORDER_REPLICATE_GPU,
|
||||
BORDER_CONSTANT_GPU,
|
||||
BORDER_REFLECT_GPU,
|
||||
BORDER_WRAP_GPU
|
||||
};
|
||||
|
||||
// Converts CPU border extrapolation mode into GPU internal analogue.
|
||||
// Returns true if the GPU analogue exists, false otherwise.
|
||||
bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);
|
||||
|
||||
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
|
||||
|
||||
/*template<class T> static inline void uploadConstant(const char* name, const T& value)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) );
|
||||
}
|
||||
|
||||
template<class T> static inline void uploadConstant(const char* name, const T& value, cudaStream_t stream)
|
||||
{
|
||||
cudaSafeCall( cudaMemcpyToSymbolAsync(name, &value, sizeof(T), 0, cudaMemcpyHostToDevice, stream) );
|
||||
} */
|
||||
|
||||
//template<class T> static inline void bindTexture(const char* name, const DevMem2D_<T>& img)
|
||||
//{
|
||||
// //!!!! const_cast is disabled!
|
||||
// //!!!! Please use constructor of 'class texture' instead.
|
||||
//
|
||||
// //textureReference* tex;
|
||||
// //cudaSafeCall( cudaGetTextureReference((const textureReference**)&tex, name) );
|
||||
// //tex->normalized = normalized;
|
||||
// //tex->filterMode = filterMode;
|
||||
// //tex->addressMode[0] = addrMode;
|
||||
// //tex->addressMode[1] = addrMode;
|
||||
//
|
||||
// const textureReference* tex;
|
||||
// cudaSafeCall( cudaGetTextureReference(&tex, name) );
|
||||
//
|
||||
// cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
|
||||
// cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
|
||||
//}
|
||||
|
||||
//static inline void unbindTexture(const char *name)
|
||||
//{
|
||||
// const textureReference* tex;
|
||||
// cudaSafeCall( cudaGetTextureReference(&tex, name) );
|
||||
// cudaSafeCall( cudaUnbindTexture(tex) );
|
||||
//}
|
||||
|
||||
|
||||
|
||||
//class TextureBinder
|
||||
//{
|
||||
//public:
|
||||
// TextureBinder() : tex_(0) {}
|
||||
// template <typename T> TextureBinder(const textureReference* tex, const DevMem2D_<T>& img) : tex_(0)
|
||||
// {
|
||||
// bind(tex, img);
|
||||
// }
|
||||
// template <typename T> TextureBinder(const char* tex_name, const DevMem2D_<T>& img) : tex_(0)
|
||||
// {
|
||||
// bind(tex_name, img);
|
||||
// }
|
||||
// ~TextureBinder() { unbind(); }
|
||||
//
|
||||
// template <typename T> void bind(const textureReference* tex, const DevMem2D_<T>& img)
|
||||
// {
|
||||
// unbind();
|
||||
//
|
||||
// cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
|
||||
// cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
|
||||
//
|
||||
// tex_ = tex;
|
||||
// }
|
||||
// template <typename T> void bind(const char* tex_name, const DevMem2D_<T>& img)
|
||||
// {
|
||||
// const textureReference* tex;
|
||||
// cudaSafeCall( cudaGetTextureReference(&tex, tex_name) );
|
||||
// bind(tex, img);
|
||||
// }
|
||||
//
|
||||
// void unbind()
|
||||
// {
|
||||
// if (tex_)
|
||||
// {
|
||||
// cudaUnbindTexture(tex_);
|
||||
// tex_ = 0;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
//private:
|
||||
// const textureReference* tex_;
|
||||
//};
|
||||
|
||||
class NppStreamHandler
|
||||
{
|
||||
public:
|
||||
inline explicit NppStreamHandler(cudaStream_t newStream = 0)
|
||||
{
|
||||
oldStream = nppGetStream();
|
||||
nppSetStream(newStream);
|
||||
}
|
||||
|
||||
inline ~NppStreamHandler()
|
||||
{
|
||||
nppSetStream(oldStream);
|
||||
}
|
||||
|
||||
private:
|
||||
cudaStream_t oldStream;
|
||||
};
|
||||
|
||||
class NppStStreamHandler
|
||||
{
|
||||
public:
|
||||
inline explicit NppStStreamHandler(cudaStream_t newStream = 0)
|
||||
{
|
||||
oldStream = nppStSetActiveCUDAstream(newStream);
|
||||
}
|
||||
|
||||
inline ~NppStStreamHandler()
|
||||
{
|
||||
nppStSetActiveCUDAstream(oldStream);
|
||||
}
|
||||
|
||||
private:
|
||||
cudaStream_t oldStream;
|
||||
};
|
||||
}}
|
||||
|
||||
#endif /* __OPENCV_internal_shared_HPP__ */
|
||||
|
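divUp() above is the usual ceiling division used for grid sizing throughout these files. A quick host-side usage sketch with arbitrary example extents (the 640x481 image size is illustrative only):

#include <cuda_runtime.h>

static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }

void launchExample()
{
    // A 640x481 image with 32x8 blocks needs a 20x61 grid, since
    // divUp(481, 8) == 61 rounds up to cover the last partial row of blocks.
    dim3 block(32, 8);
    dim3 grid(divUp(640, block.x), divUp(481, block.y));
    (void)grid; // kernels then guard with "if (x < cols && y < rows)"
}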
@ -43,10 +43,9 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/vec_math.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace cv { namespace gpu { namespace imgproc {
namespace match_template {

__device__ __forceinline__ float sum(float v) { return v; }
__device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
@ -266,9 +265,9 @@ void matchTemplatePrepared_SQDIFF_8U(int w, int h, const DevMem2D_<unsigned long

__device__ float normAcc(float num, float denum)
{
    if (fabs(num) < denum)
    if (::fabs(num) < denum)
        return num / denum;
    if (fabs(num) < denum * 1.125f)
    if (::fabs(num) < denum * 1.125f)
        return num > 0 ? 1 : -1;
    return 0;
}
@ -276,9 +275,9 @@ __device__ float normAcc(float num, float denum)

__device__ float normAcc_SQDIFF(float num, float denum)
{
    if (fabs(num) < denum)
    if (::fabs(num) < denum)
        return num / denum;
    if (fabs(num) < denum * 1.125f)
    if (::fabs(num) < denum * 1.125f)
        return num > 0 ? 1 : -1;
    return 1;
}
@ -906,4 +905,7 @@ void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cu
    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
}}}

} //namespace match_template

END_OPENCV_DEVICE_NAMESPACE
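normAcc() and normAcc_SQDIFF() above guard the normalized correlation against floating-point rounding: ratios that exceed 1 by up to 12.5% (the 1.125 factor) are snapped to +/-1, anything larger is rejected. A plain C++ restatement with sample values, for illustration only:

#include <math.h>

// With num = 1.05f and denum = 1.0f, the ratio exceeds 1 only through
// accumulated float error, so it is clamped to 1; with num = 2.0f the
// pair is treated as invalid and the result is 0.
float normAccRef(float num, float denum)
{
    if (::fabs(num) < denum)          return num / denum;
    if (::fabs(num) < denum * 1.125f) return num > 0 ? 1.f : -1.f;
    return 0.f;
}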
@ -42,178 +42,174 @@

#include "internal_shared.hpp"

using namespace cv::gpu;
BEGIN_OPENCV_DEVICE_NAMESPACE

#ifndef CV_PI
#define CV_PI 3.1415926535897932384626433832795f
#endif

namespace cv { namespace gpu { namespace mathfunc
{
namespace mathfunc {

//////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar

struct Nothing
{
    static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
    {
    }
};
struct Magnitude
{
    static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
    {
        dst[y * dst_step + x] = sqrtf(x_data * x_data + y_data * y_data);
        dst[y * dst_step + x] = ::sqrtf(x_data * x_data + y_data * y_data);
    }
};
struct MagnitudeSqr
{
    static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
    {
        dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
    }
};
struct Atan2
{
    static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
    {
        float angle = atan2f(y_data, x_data);
        float angle = ::atan2f(y_data, x_data);
        angle += (angle < 0) * 2.0 * CV_PI;
        dst[y * dst_step + x] = scale * angle;
    }
};
template <typename Mag, typename Angle>
__global__ void cartToPolar(const float* xptr, size_t x_step, const float* yptr, size_t y_step,
                            float* mag, size_t mag_step, float* angle, size_t angle_step, float scale, int width, int height)
{
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;

    if (x < width && y < height)
    {
        float x_data = xptr[y * x_step + x];
        float y_data = yptr[y * y_step + x];

        Mag::calc(x, y, x_data, y_data, mag, mag_step, scale);
        Angle::calc(x, y, x_data, y_data, angle, angle_step, scale);
    }
}

struct NonEmptyMag
{
    static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
    {
        return mag[y * mag_step + x];
    }
};
struct EmptyMag
{
    static __device__ __forceinline__ float get(const float*, size_t, int, int)
    {
        return 1.0f;
    }
};
template <typename Mag>
__global__ void polarToCart(const float* mag, size_t mag_step, const float* angle, size_t angle_step, float scale,
                            float* xptr, size_t x_step, float* yptr, size_t y_step, int width, int height)
{
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;

    if (x < width && y < height)
    {
        float mag_data = Mag::get(mag, mag_step, x, y);
        float angle_data = angle[y * angle_step + x];
        float sin_a, cos_a;

        sincosf(scale * angle_data, &sin_a, &cos_a);
        ::sincosf(scale * angle_data, &sin_a, &cos_a);

        xptr[y * x_step + x] = mag_data * cos_a;
        yptr[y * y_step + x] = mag_data * sin_a;
    }
}

template <typename Mag, typename Angle>
void cartToPolar_caller(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream)
void cartToPolar_caller(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);

    grid.x = divUp(x.cols, threads.x);
    grid.y = divUp(x.rows, threads.y);

    const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;

    cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
        x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
        mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

void cartToPolar_gpu(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, bool magSqr, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream)
void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
{
    typedef void (*caller_t)(const DevMem2Df& x, const DevMem2Df& y, const DevMem2Df& mag, const DevMem2Df& angle, bool angleInDegrees, cudaStream_t stream);
    typedef void (*caller_t)(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
    static const caller_t callers[2][2][2] =
    {
        {
            {
                cartToPolar_caller<Magnitude, Atan2>,
                cartToPolar_caller<Magnitude, Nothing>
            },
            {
                cartToPolar_caller<MagnitudeSqr, Atan2>,
                cartToPolar_caller<MagnitudeSqr, Nothing>,
            }
        },
        {
            {
                cartToPolar_caller<Nothing, Atan2>,
                cartToPolar_caller<Nothing, Nothing>
            },
            {
                cartToPolar_caller<Nothing, Atan2>,
                cartToPolar_caller<Nothing, Nothing>,
            }
        }
    };

    callers[mag.data == 0][magSqr][angle.data == 0](x, y, mag, angle, angleInDegrees, stream);
}

template <typename Mag>
void polarToCart_caller(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream)
void polarToCart_caller(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);

    grid.x = divUp(mag.cols, threads.x);
    grid.y = divUp(mag.rows, threads.y);

    const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;

    polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
        angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

void polarToCart_gpu(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream)
void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
{
    typedef void (*caller_t)(const DevMem2Df& mag, const DevMem2Df& angle, const DevMem2Df& x, const DevMem2Df& y, bool angleInDegrees, cudaStream_t stream);
    typedef void (*caller_t)(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
    static const caller_t callers[2] =
    {
        polarToCart_caller<NonEmptyMag>,
        polarToCart_caller<EmptyMag>
    };

    callers[mag.data == 0](mag, angle, x, y, angleInDegrees, stream);
}
}}}

} // namespace mathfunc

END_OPENCV_DEVICE_NAMESPACE
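Atan2::calc above wraps atan2f's (-pi, pi] result into [0, 2*pi) without a branch. A host-side restatement with one worked value, for illustration only:

#include <math.h>

// For x_data = -1, y_data = -1: ::atan2f(-1, -1) = -3*pi/4 = -2.356...,
// and adding 2*pi via the branchless "(angle < 0) * 2*pi" term gives
// 3.926..., i.e. 225 degrees once scaled by 180/pi.
float wrappedAngle(float x_data, float y_data, float scale)
{
    float angle = ::atan2f(y_data, x_data);
    angle += (angle < 0) * 2.0f * 3.14159265f;
    return scale * angle;
}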
@ -45,302 +45,304 @@
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"

namespace cv { namespace gpu { namespace device {
BEGIN_OPENCV_DEVICE_NAMESPACE

template <typename T> struct shift_and_sizeof;
template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };
template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };

///////////////////////////////////////////////////////////////////////////
////////////////////////////////// CopyTo /////////////////////////////////
///////////////////////////////////////////////////////////////////////////

template<typename T>
__global__ void copy_to_with_mask(const T* mat_src, T* mat_dst, const uchar* mask, int cols, int rows, size_t step_mat, size_t step_mask, int channels)
{
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;

    if ((x < cols * channels ) && (y < rows))
        if (mask[y * step_mask + x / channels] != 0)
        {
            size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;
            mat_dst[idx] = mat_src[idx];
        }
}

template<typename T>
void copy_to_with_mask_run(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
{
    dim3 threadsPerBlock(16,16, 1);
    dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);

    copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
            ((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall ( cudaDeviceSynchronize() );
}

void copy_to_with_mask(const DevMem2Db& mat_src, DevMem2Db mat_dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
{
    typedef void (*CopyToFunc)(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream);

    static CopyToFunc tab[8] =
    {
        copy_to_with_mask_run<unsigned char>,
        copy_to_with_mask_run<signed char>,
        copy_to_with_mask_run<unsigned short>,
        copy_to_with_mask_run<short>,
        copy_to_with_mask_run<int>,
        copy_to_with_mask_run<float>,
        copy_to_with_mask_run<double>,
        0
    };

    CopyToFunc func = tab[depth];

    if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);

    func(mat_src, mat_dst, mask, channels, stream);
}
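The copy_to_with_mask kernel above converts the byte pitch to an element pitch with a shift, which only works because every supported sizeof(T) is a power of two. A compile-time check of that invariant (C++11 static_assert; the _check suffix is added here to keep the sketch self-contained):

// sizeof(T) == 1 << shift_and_sizeof<T>::shift, so "step >> shift" equals
// "step / sizeof(T)": a 1024-byte row pitch holds 1024 >> 2 == 256 floats,
// and idx = y * 256 + x then addresses element (y, x) of the padded image.
template <typename T> struct shift_and_sizeof_check;
template <> struct shift_and_sizeof_check<float>  { enum { shift = 2 }; };
template <> struct shift_and_sizeof_check<double> { enum { shift = 3 }; };

static_assert(sizeof(float)  == 1 << shift_and_sizeof_check<float>::shift,  "float shift");
static_assert(sizeof(double) == 1 << shift_and_sizeof_check<double>::shift, "double shift");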
///////////////////////////////////////////////////////////////////////////
////////////////////////////////// SetTo //////////////////////////////////
///////////////////////////////////////////////////////////////////////////

__constant__ uchar scalar_8u[4];
__constant__ schar scalar_8s[4];
__constant__ ushort scalar_16u[4];
__constant__ short scalar_16s[4];
__constant__ int scalar_32s[4];
__constant__ float scalar_32f[4];
__constant__ double scalar_64f[4];

template <typename T> __device__ __forceinline__ T readScalar(int i);
template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}
template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}
template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}
template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}
template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}
template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}
template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];}

void writeScalar(const uchar* vals)
{
    cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );
}
void writeScalar(const schar* vals)
{
    cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );
}
void writeScalar(const ushort* vals)
{
    cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );
}
void writeScalar(const short* vals)
{
    cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );
}
void writeScalar(const int* vals)
{
    cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );
}
void writeScalar(const float* vals)
{
    cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );
}
void writeScalar(const double* vals)
{
    cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );
}

template<typename T>
__global__ void set_to_without_mask(T* mat, int cols, int rows, size_t step, int channels)
{
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;

    if ((x < cols * channels ) && (y < rows))
    {
        size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
        mat[idx] = readScalar<T>(x % channels);
    }
}

template<typename T>
__global__ void set_to_with_mask(T* mat, const uchar* mask, int cols, int rows, size_t step, int channels, size_t step_mask)
{
    size_t x = blockIdx.x * blockDim.x + threadIdx.x;
    size_t y = blockIdx.y * blockDim.y + threadIdx.y;

    if ((x < cols * channels ) && (y < rows))
        if (mask[y * step_mask + x / channels] != 0)
        {
            size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
            mat[idx] = readScalar<T>(x % channels);
        }
}

template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream)
{
    writeScalar(scalar);

    dim3 threadsPerBlock(32, 8, 1);
    dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);

    set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall ( cudaDeviceSynchronize() );
}

template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<int   >(const DevMem2Db& mat, const int* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);

template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream)
{
    writeScalar(scalar);

    dim3 threadsPerBlock(32, 8, 1);
    dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);

    set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall ( cudaDeviceSynchronize() );
}

template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<int   >(const DevMem2Db& mat, const int* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, int channels, cudaStream_t stream);
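A hypothetical call site for the masked set_to_gpu above (the function and buffer names in the sketch are assumptions): the kernel flattens each row to x = col * channels + c and selects the component with readScalar<T>(x % channels), so a fixed 4-entry scalar covers every supported channel count.

// Paint a 3-channel 8-bit image with (10, 20, 30) wherever mask is non-zero.
void setToExample(const DevMem2Db& mat, const DevMem2Db& mask, cudaStream_t stream)
{
    uchar vals[4] = { 10, 20, 30, 0 };   // one slot per possible channel
    set_to_gpu<uchar>(mat, vals, mask, /*channels=*/3, stream);
}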
///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////

template <typename T, typename D> struct Convertor : unary_function<T, D>
{
    Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}

    __device__ __forceinline__ D operator()(const T& src) const
    {
        return saturate_cast<D>(alpha * src + beta);
    }

    const double alpha, beta;
};

namespace detail
{
    template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
    {
    };
    template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>
    {
        enum { smart_shift = 8 };
    };
    template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>
    {
        enum { smart_shift = 4 };
    };
    template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };

    template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>
    {
        enum { smart_shift = 4 };
    };
    template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>
    {
        enum { smart_shift = 2 };
    };

    template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 4 };
    };
    template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>
    {
        enum { smart_block_dim_y = 8 };
        enum { smart_shift = 2 };
    };

    template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
    {
    };
}

template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
{
};

template<typename T, typename D>
void cvt_(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, cudaStream_t stream)
{
    cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
    cudaSafeCall( cudaSetDoubleForDevice(&beta) );
    Convertor<T, D> op(alpha, beta);
    transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);
}

void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta,
                 cudaStream_t stream = 0)
{
    typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta,
                             cudaStream_t stream);

    static const caller_t tab[8][8] =
    {
        {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
         cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},

        {cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
         cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
|
||||
};
|
||||
|
||||
{cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
|
||||
cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
|
||||
template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
|
||||
{
|
||||
};
|
||||
}
|
||||
|
||||
{cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
|
||||
cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
|
||||
template <typename T, typename D> struct TransformFunctorTraits< Convertor<T, D> > : detail::ConvertTraits< Convertor<T, D> >
|
||||
{
|
||||
};
|
||||
|
||||
{cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
|
||||
cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
|
||||
template<typename T, typename D>
|
||||
void cvt_(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, cudaStream_t stream)
|
||||
{
|
||||
cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
|
||||
cudaSafeCall( cudaSetDoubleForDevice(&beta) );
|
||||
Convertor<T, D> op(alpha, beta);
|
||||
OPENCV_DEVICE_NAMESPACE_ transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);
|
||||
}
|
||||
|
||||
{cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
|
||||
cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
|
||||
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta,
|
||||
cudaStream_t stream = 0)
|
||||
{
|
||||
typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta,
|
||||
cudaStream_t stream);
|
||||
|
||||
{cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
|
||||
cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
|
||||
static const caller_t tab[8][8] =
|
||||
{
|
||||
{cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
|
||||
cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},
|
||||
|
||||
{0,0,0,0,0,0,0,0}
|
||||
};
|
||||
{cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
|
||||
cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},
|
||||
|
||||
caller_t func = tab[sdepth][ddepth];
|
||||
if (!func)
|
||||
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
|
||||
{cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
|
||||
cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},
|
||||
|
||||
func(src, dst, alpha, beta, stream);
|
||||
}
|
||||
}}}
|
||||
{cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
|
||||
cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},
|
||||
|
||||
{cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
|
||||
cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},
|
||||
|
||||
{cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
|
||||
cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},
|
||||
|
||||
{cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
|
||||
cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
|
||||
|
||||
{0,0,0,0,0,0,0,0}
|
||||
};
|
||||
|
||||
caller_t func = tab[sdepth][ddepth];
|
||||
if (!func)
|
||||
cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
|
||||
|
||||
func(src, dst, alpha, beta, stream);
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
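// NOTE (illustrative sketch, not part of the diff): convert_gpu dispatches
// through the 8x8 depth table above, indexed by the OpenCV depth constants
// (CV_8U=0, CV_8S=1, CV_16U=2, CV_16S=3, CV_32S=4, CV_32F=5, CV_64F=6).
// A hypothetical host-side call scaling an 8-bit image into floats:
//
//     DevMem2Db src = ...;   // 8-bit source already on the device
//     DevMem2Db dst = ...;   // 32-bit float destination, same size
//     convert_gpu(src, 0 /*CV_8U*/, dst, 5 /*CV_32F*/, 1.0 / 255.0, 0.0, 0);
//
// Any depth pair whose table entry is 0 (index 7 in either dimension) raises
// "Unsupported convert operation" via cv::gpu::error.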
@ -40,79 +40,73 @@
//
//M*/

#include "internal_shared.hpp"
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "internal_shared.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace cv { namespace gpu { namespace mathfunc
namespace matrix_reductions {

    // Performs reduction in shared memory
    template <int size, typename T>
    __device__ void sumInSmem(volatile T* data, const uint tid)
    {
        T sum = data[tid];

        if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
        if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
        if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }

        if (tid < 32)
        {
            if (size >= 64) data[tid] = sum = sum + data[tid + 32];
            if (size >= 32) data[tid] = sum = sum + data[tid + 16];
            if (size >= 16) data[tid] = sum = sum + data[tid + 8];
            if (size >= 8) data[tid] = sum = sum + data[tid + 4];
            if (size >= 4) data[tid] = sum = sum + data[tid + 2];
            if (size >= 2) data[tid] = sum = sum + data[tid + 1];
        }
    }

    struct Mask8U
    {
        explicit Mask8U(PtrStepb mask): mask(mask) {}

        __device__ __forceinline__ bool operator()(int y, int x) const
        {
            return mask.ptr(y)[x];
        }

        PtrStepb mask;
    };

    struct MaskTrue
    {
        __device__ __forceinline__ bool operator()(int y, int x) const
        {
            return true;
        }
    };

    //////////////////////////////////////////////////////////////////////////////
    // Min max

    // To avoid shared bank conflicts we convert each value into value of
    // appropriate type (32 bits minimum)
    template <typename T> struct MinMaxTypeTraits {};
    template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<char> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
    template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
    template <> struct MinMaxTypeTraits<double> { typedef double best_type; };

    namespace minmax
    {
        __constant__ int ctwidth;
        __constant__ int ctheight;
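// NOTE (illustrative sketch, not part of the diff): sumInSmem above is the
// classic power-of-two shared-memory tree reduction, and ctwidth/ctheight keep
// the per-thread tile extents in constant memory. A hypothetical 256-thread
// block that has already staged one value per thread would finish with:
//
//     __shared__ int smem[256];
//     smem[tid] = myValue;
//     __syncthreads();
//     sumInSmem<256, int>(smem, tid);
//     if (tid == 0)
//         partials[blockIdx.y * gridDim.x + blockIdx.x] = smem[0];
//
// Once fewer than 32 threads remain, no __syncthreads() is issued; the tail of
// sumInSmem relies on warp-synchronous execution over a volatile pointer.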

@ -126,8 +120,8 @@ namespace cv { namespace gpu { namespace mathfunc
        {
            threads = dim3(32, 8);
            grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
            grid.x = min(grid.x, threads.x);
            grid.y = min(grid.y, threads.y);
            grid.x = std::min(grid.x, threads.x);
            grid.y = std::min(grid.y, threads.y);
        }

@ -155,8 +149,8 @@ namespace cv { namespace gpu { namespace mathfunc
        template <typename T>
        __device__ __forceinline__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval)
        {
            minval[tid] = min(minval[tid], minval[tid + offset]);
            maxval[tid] = max(maxval[tid], maxval[tid + offset]);
            minval[tid] = ::min(minval[tid], minval[tid + offset]);
            maxval[tid] = ::max(maxval[tid], maxval[tid + offset]);
        }

@ -192,8 +186,8 @@ namespace cv { namespace gpu { namespace mathfunc

            T mymin = numeric_limits<T>::max();
            T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min();
            uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
            uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
            uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
            uint x_end = ::min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
            for (uint y = y0; y < y_end; y += blockDim.y)
            {
                const T* src_row = (const T*)src.ptr(y);
@ -202,8 +196,8 @@ namespace cv { namespace gpu { namespace mathfunc
                    T val = src_row[x];
                    if (mask(y, x))
                    {
                        mymin = min(mymin, val);
                        mymax = max(mymax, val);
                        mymin = ::min(mymin, val);
                        mymax = ::max(mymax, val);
                    }
                }
            }
@ -220,7 +214,7 @@ namespace cv { namespace gpu { namespace mathfunc
                maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
            }

        #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
        #if __CUDA_ARCH__ >= 110
            __shared__ bool is_last;

            if (tid == 0)
@ -237,7 +231,7 @@ namespace cv { namespace gpu { namespace mathfunc

            if (is_last)
            {
                uint idx = min(tid, gridDim.x * gridDim.y - 1);
                uint idx = ::min(tid, gridDim.x * gridDim.y - 1);

                sminval[tid] = minval[idx];
                smaxval[tid] = maxval[idx];
@ -332,7 +326,7 @@ namespace cv { namespace gpu { namespace mathfunc
            __shared__ best_type smaxval[nthreads];

            uint tid = threadIdx.y * blockDim.x + threadIdx.x;
            uint idx = min(tid, size - 1);
            uint idx = ::min(tid, size - 1);

            sminval[tid] = minval[idx];
            smaxval[tid] = maxval[idx];
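// NOTE (illustrative sketch, not part of the diff): the is_last branches
// guarded by __CUDA_ARCH__ >= 110 implement single-pass completion: every
// block publishes its partial result, atomically takes a ticket, and the block
// drawing the last ticket reduces the per-block partials. A minimal sketch of
// the idiom, with blocks_finished as a hypothetical zero-initialized counter:
//
//     __device__ unsigned int blocks_finished = 0;
//
//     __shared__ bool is_last;
//     if (tid == 0)
//     {
//         __threadfence();  // make this block's partials globally visible
//         unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
//         is_last = (ticket == gridDim.x * gridDim.y - 1);
//     }
//     __syncthreads();
//     if (is_last) { /* reduce the per-block partials here */ }
//
// On devices below compute 1.1 (no global atomics) the *MultipassCaller
// variants run a second kernel over the partial results instead.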
@ -410,14 +404,13 @@ namespace cv { namespace gpu { namespace mathfunc
        template void minMaxMultipassCaller<short>(const DevMem2Db, double*, double*, PtrStepb);
        template void minMaxMultipassCaller<int>(const DevMem2Db, double*, double*, PtrStepb);
        template void minMaxMultipassCaller<float>(const DevMem2Db, double*, double*, PtrStepb);

        } // namespace minmax
    } // namespace minmax

    ///////////////////////////////////////////////////////////////////////////////
    // minMaxLoc

    namespace minmaxloc {

    namespace minmaxloc
    {
        __constant__ int ctwidth;
        __constant__ int ctheight;

@ -431,8 +424,8 @@ namespace cv { namespace gpu { namespace mathfunc
        {
            threads = dim3(32, 8);
            grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
            grid.x = min(grid.x, threads.x);
            grid.y = min(grid.y, threads.y);
            grid.x = std::min(grid.x, threads.x);
            grid.y = std::min(grid.y, threads.y);
        }

@ -513,12 +506,11 @@ namespace cv { namespace gpu { namespace mathfunc
            uint tid = threadIdx.y * blockDim.x + threadIdx.x;

            T mymin = numeric_limits<T>::max();
            T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() :
                                                     numeric_limits<T>::min();
            T mymax = numeric_limits<T>::is_signed ? -numeric_limits<T>::max() : numeric_limits<T>::min();
            uint myminloc = 0;
            uint mymaxloc = 0;
            uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
            uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
            uint y_end = ::min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
            uint x_end = ::min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);

            for (uint y = y0; y < y_end; y += blockDim.y)
            {
@ -542,7 +534,7 @@ namespace cv { namespace gpu { namespace mathfunc

            findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);

        #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
        #if __CUDA_ARCH__ >= 110
            __shared__ bool is_last;

            if (tid == 0)
@ -561,7 +553,7 @@ namespace cv { namespace gpu { namespace mathfunc

            if (is_last)
            {
                uint idx = min(tid, gridDim.x * gridDim.y - 1);
                uint idx = ::min(tid, gridDim.x * gridDim.y - 1);

                sminval[tid] = minval[idx];
                smaxval[tid] = maxval[idx];
@ -685,7 +677,7 @@ namespace cv { namespace gpu { namespace mathfunc
            __shared__ uint smaxloc[nthreads];

            uint tid = threadIdx.y * blockDim.x + threadIdx.x;
            uint idx = min(tid, size - 1);
            uint idx = ::min(tid, size - 1);

            sminval[tid] = minval[idx];
            smaxval[tid] = maxval[idx];
@ -787,15 +779,13 @@ namespace cv { namespace gpu { namespace mathfunc
        template void minMaxLocMultipassCaller<short>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
        template void minMaxLocMultipassCaller<int>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
        template void minMaxLocMultipassCaller<float>(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);

        } // namespace minmaxloc
    } // namespace minmaxloc

    //////////////////////////////////////////////////////////////////////////////////////////////////////////
    // countNonZero

    namespace countnonzero
    {

    namespace countnonzero
    {
        __constant__ int ctwidth;
        __constant__ int ctheight;
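// NOTE (illustrative, not part of the diff): every thread-config helper in
// these hunks caps the grid at threads.x x threads.y blocks, so with 32 x 8
// blocks at most 32 x 8 = 256 blocks ever run:
//
//     grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
//     grid.x = std::min(grid.x, threads.x);   // <= 32
//     grid.y = std::min(grid.y, threads.y);   // <= 8
//
// Each thread then strides over several pixels, which is what lets the
// per-block partial buffers (the PtrStepb buf arguments of the callers) keep a
// fixed upper size regardless of the input image dimensions.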

@ -805,8 +795,8 @@ namespace cv { namespace gpu { namespace mathfunc
        {
            threads = dim3(32, 8);
            grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
            grid.x = min(grid.x, threads.x);
            grid.y = min(grid.y, threads.y);
            grid.x = std::min(grid.x, threads.x);
            grid.y = std::min(grid.y, threads.y);
        }

@ -850,7 +840,7 @@ namespace cv { namespace gpu { namespace mathfunc

            sumInSmem<nthreads, uint>(scount, tid);

        #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
        #if __CUDA_ARCH__ >= 110
            __shared__ bool is_last;

            if (tid == 0)
@ -957,15 +947,14 @@ namespace cv { namespace gpu { namespace mathfunc
        template int countNonZeroMultipassCaller<int>(const DevMem2Db, PtrStepb);
        template int countNonZeroMultipassCaller<float>(const DevMem2Db, PtrStepb);

        } // namespace countnonzero
    } // namespace countnonzero

    //////////////////////////////////////////////////////////////////////////
    // Sum

    namespace sums
    {
    //////////////////////////////////////////////////////////////////////////
    // Sum

    namespace sum
    {
        template <typename T> struct SumType {};
        template <> struct SumType<uchar> { typedef uint R; };
        template <> struct SumType<char> { typedef int R; };
@ -979,7 +968,7 @@ namespace cv { namespace gpu { namespace mathfunc
        struct IdentityOp { static __device__ __forceinline__ R call(R x) { return x; } };

        template <typename R>
        struct AbsOp { static __device__ __forceinline__ R call(R x) { return abs(x); } };
        struct AbsOp { static __device__ __forceinline__ R call(R x) { return ::abs(x); } };

        template <>
        struct AbsOp<uint> { static __device__ __forceinline__ uint call(uint x) { return x; } };
@ -999,8 +988,8 @@ namespace cv { namespace gpu { namespace mathfunc
            threads = dim3(threads_x, threads_y);
            grid = dim3(divUp(cols, threads.x * threads.y),
                        divUp(rows, threads.y * threads.x));
            grid.x = min(grid.x, threads.x);
            grid.y = min(grid.y, threads.y);
            grid.x = std::min(grid.x, threads.x);
            grid.y = std::min(grid.y, threads.y);
        }

@ -1044,7 +1033,7 @@ namespace cv { namespace gpu { namespace mathfunc

            sumInSmem<nthreads, R>(smem, tid);

        #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
        #if __CUDA_ARCH__ >= 110
            __shared__ bool is_last;

            if (tid == 0)
@ -1125,7 +1114,7 @@ namespace cv { namespace gpu { namespace mathfunc
            sumInSmem<nthreads, R>(smem, tid);
            sumInSmem<nthreads, R>(smem + nthreads, tid);

        #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
        #if __CUDA_ARCH__ >= 110
            __shared__ bool is_last;

            if (tid == 0)
@ -1232,7 +1221,7 @@ namespace cv { namespace gpu { namespace mathfunc
            sumInSmem<nthreads, R>(smem + nthreads, tid);
            sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);

        #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
        #if __CUDA_ARCH__ >= 110
            __shared__ bool is_last;

            if (tid == 0)
@ -1349,7 +1338,7 @@ namespace cv { namespace gpu { namespace mathfunc
            sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
            sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);

        #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
        #if __CUDA_ARCH__ >= 110
            __shared__ bool is_last;

            if (tid == 0)
@ -1437,13 +1426,9 @@ namespace cv { namespace gpu { namespace mathfunc
            }
        }

    } // namespace sum

    template <typename T>
    void sumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
    {
        using namespace sums;
        typedef typename SumType<T>::R R;

        dim3 threads, grid;
@ -1515,7 +1500,6 @@ namespace cv { namespace gpu { namespace mathfunc
    template <typename T>
    void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
    {
        using namespace sums;
        typedef typename SumType<T>::R R;

        dim3 threads, grid;
@ -1565,7 +1549,6 @@ namespace cv { namespace gpu { namespace mathfunc
    template <typename T>
    void absSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
    {
        using namespace sums;
        typedef typename SumType<T>::R R;

        dim3 threads, grid;
@ -1637,7 +1620,6 @@ namespace cv { namespace gpu { namespace mathfunc
    template <typename T>
    void absSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
    {
        using namespace sums;
        typedef typename SumType<T>::R R;

        dim3 threads, grid;
@ -1687,7 +1669,6 @@ namespace cv { namespace gpu { namespace mathfunc
    template <typename T>
    void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
    {
        using namespace sums;
        typedef typename SumType<T>::R R;

        dim3 threads, grid;
@ -1759,7 +1740,6 @@ namespace cv { namespace gpu { namespace mathfunc
    template <typename T>
    void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn)
    {
        using namespace sums;
        typedef typename SumType<T>::R R;

        dim3 threads, grid;
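// NOTE (illustrative, not part of the diff): the recurring one-line changes in
// the hunks above are namespace hygiene, not behavior changes. In device code,
// bare min/max/abs now resolve explicitly to the CUDA global-scope overloads
// (::min, ::max, ::abs), while host-side grid clamping uses std::min.
// Likewise, #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110 collapses to
// #if __CUDA_ARCH__ >= 110, which is equivalent inside kernel code because an
// undefined macro evaluates to 0 in an #if expression.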
@ -1804,301 +1784,305 @@
    template void sqrSumCaller<short>(const DevMem2Db, PtrStepb, double*, int);
    template void sqrSumCaller<int>(const DevMem2Db, PtrStepb, double*, int);
    template void sqrSumCaller<float>(const DevMem2Db, PtrStepb, double*, int);
} // namespace sum

//////////////////////////////////////////////////////////////////////////////
// reduce

template <typename S> struct SumReductor
{
    __device__ __forceinline__ S startValue() const
    {
        return 0;
    }

    __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const
    {
        return a + b;
    }

    __device__ __forceinline__ S result(S r, double) const
    {
        return r;
    }
};

template <typename S> struct AvgReductor
{
    __device__ __forceinline__ S startValue() const
    {
        return 0;
    }

    __device__ __forceinline__ S operator ()(volatile S a, volatile S b) const
    {
        return a + b;
    }

    __device__ __forceinline__ double result(S r, double sz) const
    {
        return r / sz;
    }
};

template <typename S> struct MinReductor
{
    __device__ __forceinline__ S startValue() const
    {
        return numeric_limits<S>::max();
    }

    template <typename T> __device__ __forceinline__ T operator ()(volatile T a, volatile T b) const
    {
        return saturate_cast<T>(::min(a, b));
    }
    __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const
    {
        return ::fmin(a, b);
    }

    __device__ __forceinline__ S result(S r, double) const
    {
        return r;
    }
};

template <typename S> struct MaxReductor
{
    __device__ __forceinline__ S startValue() const
    {
        return numeric_limits<S>::min();
    }

    template <typename T> __device__ __forceinline__ int operator ()(volatile T a, volatile T b) const
    {
        return ::max(a, b);
    }
    __device__ __forceinline__ float operator ()(volatile float a, volatile float b) const
    {
        return ::fmax(a, b);
    }

    __device__ __forceinline__ S result(S r, double) const
    {
        return r;
    }
};

template <class Op, typename T, typename S, typename D> __global__ void reduceRows(const DevMem2D_<T> src, D* dst, const Op op)
{
    __shared__ S smem[16 * 16];

    const int x = blockIdx.x * 16 + threadIdx.x;

    S myVal = op.startValue();

    if (x < src.cols)
    {
        for (int y = threadIdx.y; y < src.rows; y += 16)
            myVal = op(myVal, src.ptr(y)[x]);
    }

    smem[threadIdx.x * 16 + threadIdx.y] = myVal;
    __syncthreads();

    if (threadIdx.x < 8)
    {
        volatile S* srow = smem + threadIdx.y * 16;
        srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 8]);
        srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 4]);
        srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 2]);
        srow[threadIdx.x] = op(srow[threadIdx.x], srow[threadIdx.x + 1]);
    }
    __syncthreads();

    if (threadIdx.y == 0 && x < src.cols)
        dst[x] = saturate_cast<D>(op.result(smem[threadIdx.x * 16], src.rows));
}

template <template <typename> class Op, typename T, typename S, typename D> void reduceRows_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)
{
    const dim3 block(16, 16);
    const dim3 grid(divUp(src.cols, block.x));

    Op<S> op;
    reduceRows<Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream)
{
    typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);

    static const caller_t callers[] =
    {
        reduceRows_caller<SumReductor, T, S, D>,
        reduceRows_caller<AvgReductor, T, S, D>,
        reduceRows_caller<MaxReductor, T, S, D>,
        reduceRows_caller<MinReductor, T, S, D>
    };

    callers[reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);
}

template void reduceRows_gpu<uchar, int, uchar>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<uchar, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<uchar, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);

template void reduceRows_gpu<ushort, int, ushort>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<ushort, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<ushort, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);

template void reduceRows_gpu<short, int, short>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<short, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<short, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);

template void reduceRows_gpu<int, int, int>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceRows_gpu<int, int, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);

template void reduceRows_gpu<float, float, float>(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);



template <int cn, class Op, typename T, typename S, typename D> __global__ void reduceCols(const DevMem2D_<T> src, D* dst, const Op op)
{
    __shared__ S smem[256 * cn];

    const int y = blockIdx.x;

    const T* src_row = src.ptr(y);

    S myVal[cn];

#pragma unroll
    for (int c = 0; c < cn; ++c)
        myVal[c] = op.startValue();

#if __CUDA_ARCH__ >= 200

    // For cc >= 2.0 prefer L1 cache
    for (int x = threadIdx.x; x < src.cols; x += 256)
    {
    #pragma unroll
        for (int c = 0; c < cn; ++c)
            myVal[c] = op(myVal[c], src_row[x * cn + c]);
    }

#else // __CUDA_ARCH__ >= 200

    // For older arch use shared memory for cache
    for (int x = 0; x < src.cols; x += 256)
    {
    #pragma unroll
        for (int c = 0; c < cn; ++c)
        {
            smem[c * 256 + threadIdx.x] = op.startValue();
            const int load_x = x * cn + c * 256 + threadIdx.x;
            if (load_x < src.cols * cn)
                smem[c * 256 + threadIdx.x] = src_row[load_x];
        }
        __syncthreads();

    #pragma unroll
        for (int c = 0; c < cn; ++c)
            myVal[c] = op(myVal[c], smem[threadIdx.x * cn + c]);
        __syncthreads();
    }

#endif // __CUDA_ARCH__ >= 200

#pragma unroll
    for (int c = 0; c < cn; ++c)
        smem[c * 256 + threadIdx.x] = myVal[c];
    __syncthreads();

    if (threadIdx.x < 128)
    {
    #pragma unroll
        for (int c = 0; c < cn; ++c)
            smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 128]);
    }
    __syncthreads();

    if (threadIdx.x < 64)
    {
    #pragma unroll
        for (int c = 0; c < cn; ++c)
            smem[c * 256 + threadIdx.x] = op(smem[c * 256 + threadIdx.x], smem[c * 256 + threadIdx.x + 64]);
    }
    __syncthreads();

    volatile S* sdata = smem;

    if (threadIdx.x < 32)
    {
    #pragma unroll
        for (int c = 0; c < cn; ++c)
        {
            sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 32]);
            sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 16]);
            sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 8]);
            sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 4]);
            sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 2]);
            sdata[c * 256 + threadIdx.x] = op(sdata[c * 256 + threadIdx.x], sdata[c * 256 + threadIdx.x + 1]);
        }
    }
    __syncthreads();

    if (threadIdx.x < cn)
        dst[y * cn + threadIdx.x] = saturate_cast<D>(op.result(smem[threadIdx.x * 256], src.cols));
}

template <int cn, template <typename> class Op, typename T, typename S, typename D> void reduceCols_caller(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream)
{
    const dim3 block(256);
    const dim3 grid(src.rows);

    Op<S> op;
    reduceCols<cn, Op<S>, T, S, D><<<grid, block, 0, stream>>>(src, dst.data, op);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream)
{
    typedef void (*caller_t)(const DevMem2D_<T>& src, DevMem2D_<D> dst, cudaStream_t stream);

    static const caller_t callers[4][4] =
    {
        {reduceCols_caller<1, SumReductor, T, S, D>, reduceCols_caller<1, AvgReductor, T, S, D>, reduceCols_caller<1, MaxReductor, T, S, D>, reduceCols_caller<1, MinReductor, T, S, D>},
        {reduceCols_caller<2, SumReductor, T, S, D>, reduceCols_caller<2, AvgReductor, T, S, D>, reduceCols_caller<2, MaxReductor, T, S, D>, reduceCols_caller<2, MinReductor, T, S, D>},
        {reduceCols_caller<3, SumReductor, T, S, D>, reduceCols_caller<3, AvgReductor, T, S, D>, reduceCols_caller<3, MaxReductor, T, S, D>, reduceCols_caller<3, MinReductor, T, S, D>},
        {reduceCols_caller<4, SumReductor, T, S, D>, reduceCols_caller<4, AvgReductor, T, S, D>, reduceCols_caller<4, MaxReductor, T, S, D>, reduceCols_caller<4, MinReductor, T, S, D>},
    };

    callers[cn - 1][reduceOp](static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<D> >(dst), stream);
}

template void reduceCols_gpu<uchar, int, uchar>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<uchar, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<uchar, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);

template void reduceCols_gpu<ushort, int, ushort>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<ushort, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<ushort, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);

template void reduceCols_gpu<short, int, short>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<short, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<short, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);

template void reduceCols_gpu<int, int, int>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template void reduceCols_gpu<int, int, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);

template void reduceCols_gpu<float, float, float>(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);

} // namespace matrix_reductions

END_OPENCV_DEVICE_NAMESPACE
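// NOTE (illustrative sketch, not part of the diff): reduceRows_gpu and
// reduceCols_gpu are the device back ends behind cv::gpu::reduce. A
// hypothetical direct call that reduces each column of an 8-bit single-channel
// image to its average (reduceOp 1 = AvgReductor, following the callers[]
// order Sum, Avg, Max, Min) might look like:
//
//     DevMem2Db src = ...;   // rows x cols, uchar, on the device
//     DevMem2Db dst = ...;   // 1 x cols, float
//     reduceRows_gpu<uchar, int, float>(src, dst, 1, 0);
//
// reduceCols_gpu additionally takes the channel count (1..4) and indexes the
// callers[cn - 1][reduceOp] table.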
||||
|
@ -46,140 +46,142 @@
|
||||
#include "opencv2/gpu/device/vec_math.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
using namespace cv::gpu::device;
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
namespace cv { namespace gpu { namespace imgproc
|
||||
namespace pyr_down {
|
||||
|
||||
template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)
|
||||
{
|
||||
template <typename T, typename B> __global__ void pyrDown(const PtrStep<T> src, PtrStep<T> dst, const B b, int dst_cols)
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y;
|
||||
|
||||
__shared__ value_type smem[256 + 4];
|
||||
|
||||
value_type sum;
|
||||
|
||||
const int src_y = 2*y;
|
||||
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y - 1, x, src.data, src.step);
|
||||
sum = sum + 0.375f * b.at(src_y , x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y + 1, x, src.data, src.step);
|
||||
sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);
|
||||
|
||||
smem[2 + threadIdx.x] = sum;
|
||||
|
||||
if (threadIdx.x < 2)
|
||||
{
|
||||
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y;
|
||||
|
||||
__shared__ value_type smem[256 + 4];
|
||||
|
||||
value_type sum;
|
||||
|
||||
const int src_y = 2*y;
|
||||
const int left_x = x - 2 + threadIdx.x;
|
||||
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y - 1, x, src.data, src.step);
|
||||
sum = sum + 0.375f * b.at(src_y , x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y + 1, x, src.data, src.step);
|
||||
sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);
|
||||
sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y - 1, left_x, src.data, src.step);
|
||||
sum = sum + 0.375f * b.at(src_y , left_x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y + 1, left_x, src.data, src.step);
|
||||
sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);
|
||||
|
||||
smem[2 + threadIdx.x] = sum;
|
||||
|
||||
if (threadIdx.x < 2)
|
||||
{
|
||||
const int left_x = x - 2 + threadIdx.x;
|
||||
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y - 1, left_x, src.data, src.step);
|
||||
sum = sum + 0.375f * b.at(src_y , left_x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y + 1, left_x, src.data, src.step);
|
||||
sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);
|
||||
|
||||
smem[threadIdx.x] = sum;
|
||||
}
|
||||
|
||||
if (threadIdx.x > 253)
|
||||
{
|
||||
const int right_x = x + threadIdx.x + 2;
|
||||
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y - 1, right_x, src.data, src.step);
|
||||
sum = sum + 0.375f * b.at(src_y , right_x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y + 1, right_x, src.data, src.step);
|
||||
sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);
|
||||
|
||||
smem[4 + threadIdx.x] = sum;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < 128)
|
||||
{
|
||||
const int tid2 = threadIdx.x * 2;
|
||||
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
sum = sum + 0.0625f * smem[2 + tid2 - 2];
|
||||
sum = sum + 0.25f * smem[2 + tid2 - 1];
|
||||
sum = sum + 0.375f * smem[2 + tid2 ];
|
||||
sum = sum + 0.25f * smem[2 + tid2 + 1];
|
||||
sum = sum + 0.0625f * smem[2 + tid2 + 2];
|
||||
|
||||
const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
|
||||
|
||||
if (dst_x < dst_cols)
|
||||
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
|
||||
}
|
||||
smem[threadIdx.x] = sum;
|
||||
}
|
||||
|
||||
template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
|
||||
if (threadIdx.x > 253)
|
||||
{
|
||||
const dim3 block(256);
|
||||
const dim3 grid(divUp(src.cols, block.x), dst.rows);
|
||||
const int right_x = x + threadIdx.x + 2;
|
||||
|
||||
B<T> b(src.rows, src.cols);
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y - 1, right_x, src.data, src.step);
|
||||
sum = sum + 0.375f * b.at(src_y , right_x, src.data, src.step);
|
||||
sum = sum + 0.25f * b.at(src_y + 1, right_x, src.data, src.step);
|
||||
sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
smem[4 + threadIdx.x] = sum;
|
||||
}
|
||||
|
||||
template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < 128)
|
||||
{
|
||||
typedef typename TypeVec<T, cn>::vec_type type;
|
||||
const int tid2 = threadIdx.x * 2;
|
||||
|
||||
typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
|
||||
sum = VecTraits<value_type>::all(0);
|
||||
|
||||
static const caller_t callers[] =
|
||||
{
|
||||
pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>
|
||||
};
|
||||
sum = sum + 0.0625f * smem[2 + tid2 - 2];
|
||||
sum = sum + 0.25f * smem[2 + tid2 - 1];
|
||||
sum = sum + 0.375f * smem[2 + tid2 ];
|
||||
sum = sum + 0.25f * smem[2 + tid2 + 1];
|
||||
sum = sum + 0.0625f * smem[2 + tid2 + 2];
|
||||
|
||||
callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
|
||||
const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
|
||||
|
||||
if (dst_x < dst_cols)
|
||||
dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
|
||||
}
|
||||
}
    template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        const dim3 block(256);
        const dim3 grid(divUp(src.cols, block.x), dst.rows);

        B<T> b(src.rows, src.cols);

        pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
    {
        typedef typename TypeVec<T, cn>::vec_type type;

        typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);

        static const caller_t callers[] =
        {
            pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>, pyrDown_caller<type, BrdReflect>, pyrDown_caller<type, BrdWrap>
        };

        callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
    }

    template void pyrDown_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);

    template void pyrDown_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);

    template void pyrDown_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);

    template void pyrDown_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);

    template void pyrDown_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);

    template void pyrDown_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrDown_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);

}}}
} // namespace pyr_down

END_OPENCV_DEVICE_NAMESPACE
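Every grid size in these files comes from divUp; for reference, it is the usual round-up integer division. A sketch of the assumed definition (it lives in the shared device utilities, not in this hunk):

    // Round an integer division up, so the grid covers the whole image even
    // when cols/rows are not multiples of the block size.
    static inline int divUp(int total, int grain)
    {
        return (total + grain - 1) / grain;
    }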
@ -46,135 +46,137 @@

#include "opencv2/gpu/device/vec_math.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace cv { namespace gpu { namespace imgproc
namespace pyr_up {

    template <typename T, typename B> __global__ void pyrUp(const PtrStep<T> src, DevMem2D_<T> dst, const B b)
    {
        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        __shared__ T smem1[10][10];
        __shared__ value_type smem2[20][16];

        value_type sum;

        if (threadIdx.x < 10 && threadIdx.y < 10)
            smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);

        __syncthreads();

        const int tidx = threadIdx.x;

        sum = VecTraits<value_type>::all(0);

        sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx - 2) >> 1)];
        sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[1 + threadIdx.y / 2][1 + ((tidx - 1) >> 1)];
        sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[1 + threadIdx.y / 2][1 + ((tidx    ) >> 1)];
        sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[1 + threadIdx.y / 2][1 + ((tidx + 1) >> 1)];
        sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[1 + threadIdx.y / 2][1 + ((tidx + 2) >> 1)];

        smem2[2 + threadIdx.y][tidx] = sum;

        if (threadIdx.y < 2)
        {
            sum = VecTraits<value_type>::all(0);

            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx - 2) >> 1)];
            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[0][1 + ((tidx - 1) >> 1)];
            sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[0][1 + ((tidx    ) >> 1)];
            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[0][1 + ((tidx + 1) >> 1)];
            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[0][1 + ((tidx + 2) >> 1)];

            smem2[threadIdx.y][tidx] = sum;
        }

        if (threadIdx.y > 13)
        {
            sum = VecTraits<value_type>::all(0);

            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx - 2) >> 1)];
            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[9][1 + ((tidx - 1) >> 1)];
            sum = sum + (tidx % 2 == 0) * 0.375f  * smem1[9][1 + ((tidx    ) >> 1)];
            sum = sum + (tidx % 2 != 0) * 0.25f   * smem1[9][1 + ((tidx + 1) >> 1)];
            sum = sum + (tidx % 2 == 0) * 0.0625f * smem1[9][1 + ((tidx + 2) >> 1)];

            smem2[4 + threadIdx.y][tidx] = sum;
        }

        __syncthreads();

        sum = VecTraits<value_type>::all(0);

        sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y - 2][tidx];
        sum = sum + (tidx % 2 != 0) * 0.25f   * smem2[2 + threadIdx.y - 1][tidx];
        sum = sum + (tidx % 2 == 0) * 0.375f  * smem2[2 + threadIdx.y    ][tidx];
        sum = sum + (tidx % 2 != 0) * 0.25f   * smem2[2 + threadIdx.y + 1][tidx];
        sum = sum + (tidx % 2 == 0) * 0.0625f * smem2[2 + threadIdx.y + 2][tidx];

        if (x < dst.cols && y < dst.rows)
            dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);
    }
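The (tidx % 2 == 0) and (tidx % 2 != 0) factors fold the two interpolation phases into branch-free arithmetic: even output columns take the 1/16-6/16-1/16 taps, odd columns the two 4/16 taps. A hypothetical branchy equivalent of one row pass, for readability only (row stands in for the smem1 row index used above):

    // Equivalent selection written out explicitly; each boolean factor above
    // simply zeroes the taps that do not apply to this output column.
    if (tidx % 2 == 0)
        sum = 0.0625f * smem1[row][1 + ((tidx - 2) >> 1)]
            + 0.375f  * smem1[row][1 + ( tidx      >> 1)]
            + 0.0625f * smem1[row][1 + ((tidx + 2) >> 1)];
    else
        sum = 0.25f   * smem1[row][1 + ((tidx - 1) >> 1)]
            + 0.25f   * smem1[row][1 + ((tidx + 1) >> 1)];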

    template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

        B<T> b(src.rows, src.cols);

        pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream)
    {
        typedef typename TypeVec<T, cn>::vec_type type;

        typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);

        static const caller_t callers[] =
        {
            pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>, pyrUp_caller<type, BrdReflect>, pyrUp_caller<type, BrdWrap>
        };

        callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
    }

    template void pyrUp_gpu<uchar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<uchar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<uchar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<uchar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<schar, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<schar, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<schar, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<schar, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<ushort, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<ushort, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<ushort, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<ushort, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<short, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<short, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<short, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<short, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<int, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<int, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<int, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<int, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);

    template void pyrUp_gpu<float, 1>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<float, 2>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<float, 3>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    template void pyrUp_gpu<float, 4>(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);

}}}
} // namespace pyr_up

END_OPENCV_DEVICE_NAMESPACE
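For reviewers tracing the call path: the .cpp wrapper picks one of these instantiations from the Mat depth and channel count. A hypothetical dispatch sketch (table layout assumed; the real wrapper is not part of this hunk):

    // Hypothetical host-side dispatch over depth x channels.
    typedef void (*func_t)(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
    static const func_t funcs[6][4] =
    {
        { pyrUp_gpu<uchar, 1>,  pyrUp_gpu<uchar, 2>,  pyrUp_gpu<uchar, 3>,  pyrUp_gpu<uchar, 4>  },
        { pyrUp_gpu<schar, 1>,  pyrUp_gpu<schar, 2>,  pyrUp_gpu<schar, 3>,  pyrUp_gpu<schar, 4>  },
        { pyrUp_gpu<ushort, 1>, pyrUp_gpu<ushort, 2>, pyrUp_gpu<ushort, 3>, pyrUp_gpu<ushort, 4> },
        { pyrUp_gpu<short, 1>,  pyrUp_gpu<short, 2>,  pyrUp_gpu<short, 3>,  pyrUp_gpu<short, 4>  },
        { pyrUp_gpu<int, 1>,    pyrUp_gpu<int, 2>,    pyrUp_gpu<int, 3>,    pyrUp_gpu<int, 4>    },
        { pyrUp_gpu<float, 1>,  pyrUp_gpu<float, 2>,  pyrUp_gpu<float, 3>,  pyrUp_gpu<float, 4>  }
    };
    funcs[src.depth()][src.channels() - 1](src, dst, borderType, stream);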

@ -47,64 +47,62 @@

#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace cv { namespace gpu { namespace imgproc
namespace remap {

    template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x < dst.cols && y < dst.rows)
        {
            const float xcoo = mapx.ptr(y)[x];
            const float ycoo = mapy.ptr(y)[x];

            dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
        }
    }
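Each output pixel fetches its source coordinate from the two float maps, so arbitrary warps reduce to filling xmap/ymap on the host. For instance, a pure translation by (dx, dy) would look like this (illustrative snippet assuming cv::Mat_<float> maps of the destination size; not part of this diff):

    // dst(x, y) will sample src(x + dx, y + dy); out-of-range coordinates
    // are resolved by the border policy selected through the dispatchers below.
    cv::Mat_<float> xmap(rows, cols), ymap(rows, cols);
    for (int y = 0; y < rows; ++y)
        for (int x = 0; x < cols; ++x)
        {
            xmap(y, x) = static_cast<float>(x) + dx;
            ymap(y, x) = static_cast<float>(y) + dy;
        }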

    template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherStream
    {
        static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst,
            const float* borderValue, cudaStream_t stream, int)
        {
            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;

            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
            BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
            Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);

            remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
            cudaSafeCall( cudaGetLastError() );
        }
    };

    template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStream
    {
        static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, const float* borderValue, int)
        {
            typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;

            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
            BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
            Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);

            remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
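The two dispatchers differ only in the launch's stream argument and the trailing device synchronize. A condensed host-side view of that contract (hypothetical snippet; s, src, mapx, mapy, dst, borderValue and cc are assumed to be prepared by the caller):

    // Asynchronous: work is queued on s; completion is the caller's business.
    RemapDispatcherStream<LinearFilter, BrdReplicate, uchar>::call(src, mapx, mapy, dst, borderValue, s, cc);
    cudaSafeCall( cudaStreamSynchronize(s) );

    // Blocking: returns only after the cudaDeviceSynchronize() inside the call.
    RemapDispatcherNonStream<LinearFilter, BrdReplicate, uchar>::call(src, mapx, mapy, dst, borderValue, cc);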

    #define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type) \
        texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
@ -124,7 +122,7 @@ namespace cv { namespace gpu { namespace imgproc
            typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
            dim3 block(32, cc >= 20 ? 8 : 4); \
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
            TextureBinder texHandler(&tex_remap_ ## type , src); \
            bindTexture(&tex_remap_ ## type , src); \
            tex_remap_ ## type ##_reader texSrc; \
            B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
            BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
@ -140,7 +138,7 @@ namespace cv { namespace gpu { namespace imgproc
        { \
            dim3 block(32, 8); \
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
            TextureBinder texHandler(&tex_remap_ ## type , src); \
            bindTexture(&tex_remap_ ## type , src); \
            tex_remap_ ## type ##_reader texSrc; \
            Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
            remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
@ -149,105 +147,108 @@ namespace cv { namespace gpu { namespace imgproc
        } \
    };
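cc is the device compute capability encoded as major*10 + minor, so Fermi-class devices (cc >= 20) get 32x8 = 256-thread blocks and earlier ones 32x4. A sketch of how the host wrapper is assumed to obtain it (not shown in this hunk):

    cv::gpu::DeviceInfo info;
    const int cc = info.majorVersion() * 10 + info.minorVersion();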

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar4)

    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(schar)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char2)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(char4)

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(ushort4)

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(short)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(short2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(short4)

    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int2)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(int4)

    OPENCV_GPU_IMPLEMENT_REMAP_TEX(float)
    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(float2)
    OPENCV_GPU_IMPLEMENT_REMAP_TEX(float4)

    #undef OPENCV_GPU_IMPLEMENT_REMAP_TEX

    template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
    {
        static void call(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst,
            const float* borderValue, cudaStream_t stream, int cc)
        {
            if (stream == 0)
                RemapDispatcherNonStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, cc);
            else
                RemapDispatcherStream<Filter, B, T>::call(src, mapx, mapy, dst, borderValue, stream, cc);
        }
    };

    template <typename T> void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation,
        int borderMode, const float* borderValue, cudaStream_t stream, int cc)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst,
            const float* borderValue, cudaStream_t stream, int cc);

        static const caller_t callers[3][5] =
        {
            {
                RemapDispatcher<PointFilter, BrdReflect101, T>::call,
                RemapDispatcher<PointFilter, BrdReplicate, T>::call,
                RemapDispatcher<PointFilter, BrdConstant, T>::call,
                RemapDispatcher<PointFilter, BrdReflect, T>::call,
                RemapDispatcher<PointFilter, BrdWrap, T>::call
            },
            {
                RemapDispatcher<LinearFilter, BrdReflect101, T>::call,
                RemapDispatcher<LinearFilter, BrdReplicate, T>::call,
                RemapDispatcher<LinearFilter, BrdConstant, T>::call,
                RemapDispatcher<LinearFilter, BrdReflect, T>::call,
                RemapDispatcher<LinearFilter, BrdWrap, T>::call
            },
            {
                RemapDispatcher<CubicFilter, BrdReflect101, T>::call,
                RemapDispatcher<CubicFilter, BrdReplicate, T>::call,
                RemapDispatcher<CubicFilter, BrdConstant, T>::call,
                RemapDispatcher<CubicFilter, BrdReflect, T>::call,
                RemapDispatcher<CubicFilter, BrdWrap, T>::call
            }
        };

        callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), borderValue, stream, cc);
    }

    template void remap_gpu<uchar >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    //template void remap_gpu<uchar2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    template void remap_gpu<uchar3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    template void remap_gpu<uchar4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

    //template void remap_gpu<schar>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    //template void remap_gpu<char2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    //template void remap_gpu<char3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    //template void remap_gpu<char4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

    template void remap_gpu<ushort >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    //template void remap_gpu<ushort2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    template void remap_gpu<ushort3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    template void remap_gpu<ushort4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

    template void remap_gpu<short >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    //template void remap_gpu<short2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    template void remap_gpu<short3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    template void remap_gpu<short4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

    //template void remap_gpu<int >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    //template void remap_gpu<int2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    //template void remap_gpu<int3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    //template void remap_gpu<int4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

    template void remap_gpu<float >(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    //template void remap_gpu<float2>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    template void remap_gpu<float3>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
    template void remap_gpu<float4>(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);

}}}
} // namespace remap

END_OPENCV_DEVICE_NAMESPACE
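A hypothetical host-side call for review context (indices follow the callers[3][5] table above; src, xmap, ymap, dst and cc are assumed to be prepared):

    // interpolation: 0 = point, 1 = linear, 2 = cubic (row of callers);
    // borderMode: 0 = reflect101, 1 = replicate, 2 = constant, 3 = reflect, 4 = wrap.
    const float borderValue[4] = { 0.f, 0.f, 0.f, 0.f };
    remap_gpu<uchar3>(src, xmap, ymap, dst, 1 /* linear */, 1 /* replicate */, borderValue, 0 /* default stream */, cc);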

@ -47,102 +47,100 @@

#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/filters.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace cv { namespace gpu { namespace imgproc
namespace resize {

    template <typename Ptr2D, typename T> __global__ void resize(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x < dst.cols && y < dst.rows)
        {
            const float xcoo = x / fx;
            const float ycoo = y / fy;

            dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
        }
    }

    template <typename Ptr2D, typename T> __global__ void resizeNN(const Ptr2D src, float fx, float fy, DevMem2D_<T> dst)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x < dst.cols && y < dst.rows)
        {
            const float xcoo = x / fx;
            const float ycoo = y / fy;

            dst.ptr(y)[x] = src(__float2int_rd(ycoo), __float2int_rd(xcoo));
        }
    }
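Both kernels use the inverse mapping x / fx, y / fy, where fx and fy are the dst/src scale factors; resizeNN truncates the result toward negative infinity with __float2int_rd, the device equivalent of floor. A tiny illustration, assuming a 2x upscale:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const float fx = 2.0f; // dst.cols == 2 * src.cols
        for (int x = 0; x < 4; ++x) // dst columns 0..3 read src columns 0, 0, 1, 1
            std::printf("dst x=%d samples src x=%.2f (NN -> %d)\n", x, x / fx, (int)std::floor(x / fx));
        return 0;
    }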

    template <template <typename> class Filter, typename T> struct ResizeDispatcherStream
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            BrdReplicate<T> brd(src.rows, src.cols);
            BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
            Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);

            resize<<<grid, block, 0, stream>>>(filter_src, fx, fy, dst);
            cudaSafeCall( cudaGetLastError() );
        }
    };
    template <typename T> struct ResizeDispatcherStream<PointFilter, T>
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            BrdReplicate<T> brd(src.rows, src.cols);
            BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);

            resizeNN<<<grid, block, 0, stream>>>(brdSrc, fx, fy, dst);
            cudaSafeCall( cudaGetLastError() );
        }
    };

    template <template <typename> class Filter, typename T> struct ResizeDispatcherNonStream
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            BrdReplicate<T> brd(src.rows, src.cols);
            BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
            Filter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filter_src(brdSrc);

            resize<<<grid, block>>>(filter_src, fx, fy, dst);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
    template <typename T> struct ResizeDispatcherNonStream<PointFilter, T>
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst)
        {
            dim3 block(32, 8);
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

            BrdReplicate<T> brd(src.rows, src.cols);
            BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);

            resizeNN<<<grid, block>>>(brdSrc, fx, fy, dst);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    #define OPENCV_GPU_IMPLEMENT_RESIZE_TEX(type) \
        texture< type , cudaTextureType2D> tex_resize_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
@ -161,7 +159,7 @@ namespace cv { namespace gpu { namespace imgproc
        { \
            dim3 block(32, 8); \
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
            TextureBinder texHandler(&tex_resize_ ## type , src); \
            bindTexture(&tex_resize_ ## type , src); \
            tex_resize_ ## type ##_reader texSrc; \
            Filter< tex_resize_ ## type ##_reader > filter_src(texSrc); \
            resize<<<grid, block>>>(filter_src, fx, fy, dst); \
@ -175,7 +173,7 @@ namespace cv { namespace gpu { namespace imgproc
        { \
            dim3 block(32, 8); \
            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
            TextureBinder texHandler(&tex_resize_ ## type , src); \
            bindTexture(&tex_resize_ ## type , src); \
            tex_resize_ ## type ##_reader texSrc; \
            resizeNN<<<grid, block>>>(texSrc, fx, fy, dst); \
            cudaSafeCall( cudaGetLastError() ); \
@ -183,82 +181,85 @@ namespace cv { namespace gpu { namespace imgproc
        } \
    };

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(uchar4)

    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(schar)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char2)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(char4)

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(ushort4)

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(short4)

    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int2)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(int4)

    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float)
    //OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float2)
    OPENCV_GPU_IMPLEMENT_RESIZE_TEX(float4)

    #undef OPENCV_GPU_IMPLEMENT_RESIZE_TEX

    template <template <typename> class Filter, typename T> struct ResizeDispatcher
    {
        static void call(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream)
        {
            if (stream == 0)
                ResizeDispatcherNonStream<Filter, T>::call(src, fx, fy, dst);
            else
                ResizeDispatcherStream<Filter, T>::call(src, fx, fy, dst, stream);
        }
    };

    template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream)
    {
        typedef void (*caller_t)(const DevMem2D_<T>& src, float fx, float fy, const DevMem2D_<T>& dst, cudaStream_t stream);

        static const caller_t callers[3] =
        {
            ResizeDispatcher<PointFilter, T>::call, ResizeDispatcher<LinearFilter, T>::call, ResizeDispatcher<CubicFilter, T>::call
        };

        callers[interpolation](static_cast< DevMem2D_<T> >(src), fx, fy, static_cast< DevMem2D_<T> >(dst), stream);
    }

    template void resize_gpu<uchar >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<uchar2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<uchar3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<uchar4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);

    //template void resize_gpu<schar>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<char2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<char3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<char4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);

    template void resize_gpu<ushort >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<ushort2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<ushort3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<ushort4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);

    template void resize_gpu<short >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<short2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<short3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
    template void resize_gpu<short4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);

    //template void resize_gpu<int >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<int2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<int3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
    //template void resize_gpu<int4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<short3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<short4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<float >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<float2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<float3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<float4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
|
||||
//template void resize_gpu<int >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<int2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<int3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<int4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
} // namespace resize
|
||||
|
||||
template void resize_gpu<float >(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
//template void resize_gpu<float2>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<float3>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
template void resize_gpu<float4>(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
|
||||
}}}
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
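The two dispatch levels above compose as follows; this is an illustration, not part of the patch, and it assumes the interpolation argument carries OpenCV's INTER_NEAREST/INTER_LINEAR/INTER_CUBIC codes (0, 1, 2), which is what the order of callers[3] implies:

    // resize_gpu<uchar4>(src, fx, fy, dst, 1 /*INTER_LINEAR, assumed*/, 0 /*default stream*/)
    //   -> callers[1] == ResizeDispatcher<LinearFilter, uchar4>::call
    //   -> stream == 0, so ResizeDispatcherNonStream<LinearFilter, uchar4>::call(src, fx, fy, dst)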
@ -47,8 +47,7 @@
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

#define MAX_KERNEL_SIZE 16
#define BLOCK_DIM_X 16
@ -56,218 +55,218 @@ using namespace cv::gpu::device;
#define RESULT_STEPS 8
#define HALO_STEPS 1

namespace row_filter {

__constant__ float c_kernel[MAX_KERNEL_SIZE];

void loadKernel(const float kernel[], int ksize)
{
    cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
}

namespace detail
{
    template <typename T, size_t size> struct SmemType
    {
        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type smem_t;
    };

    template <typename T> struct SmemType<T, 4>
    {
        typedef T smem_t;
    };
}

template <typename T> struct SmemType
{
    typedef typename detail::SmemType<T, sizeof(T)>::smem_t smem_t;
};

template <int KERNEL_SIZE, typename T, typename D, typename B>
__global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, int anchor, const B b)
{
    typedef typename SmemType<T>::smem_t smem_t;
    typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;

    __shared__ smem_t smem[BLOCK_DIM_Y][(RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_X];

    //Offset to the left halo edge
    const int x = (blockIdx.x * RESULT_STEPS - HALO_STEPS) * BLOCK_DIM_X + threadIdx.x;
    const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;

    if (y < src.rows)
    {
        const T* src_row = src.ptr(y);

        //Load main data
        #pragma unroll
        for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
            smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);

        //Load left halo
        #pragma unroll
        for(int i = 0; i < HALO_STEPS; ++i)
            smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_low(i * BLOCK_DIM_X + x, src_row);

        //Load right halo
        #pragma unroll
        for(int i = HALO_STEPS + RESULT_STEPS; i < HALO_STEPS + RESULT_STEPS + HALO_STEPS; ++i)
            smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X] = b.at_high(i * BLOCK_DIM_X + x, src_row);

        __syncthreads();

        D* dst_row = dst.ptr(y);

        #pragma unroll
        for(int i = HALO_STEPS; i < HALO_STEPS + RESULT_STEPS; ++i)
        {
            sum_t sum = VecTraits<sum_t>::all(0);

            #pragma unroll
            for (int j = 0; j < KERNEL_SIZE; ++j)
                sum = sum + smem[threadIdx.y][threadIdx.x + i * BLOCK_DIM_X + j - anchor] * c_kernel[j];

            int dstX = x + i * BLOCK_DIM_X;

            if (dstX < src.cols)
                dst_row[dstX] = saturate_cast<D>(sum);
        }
    }
}
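Tile arithmetic implied by the defines above (a derived note, not text from the patch):

    // samples staged per shared-memory row: (RESULT_STEPS + 2 * HALO_STEPS) * BLOCK_DIM_X = (8 + 2) * 16 = 160
    // outputs written per row:               RESULT_STEPS * BLOCK_DIM_X = 8 * 16 = 128
    // hence the caller below sizes the grid with divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X)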
template <int ksize, typename T, typename D, template<typename> class B>
void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream)
{
    typedef typename SmemType<T>::smem_t smem_t;

    const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y);
    const dim3 grid(divUp(src.cols, RESULT_STEPS * BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));

    B<smem_t> b(src.cols);

    linearRowFilter<ksize, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, b);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

template <typename T, typename D>
void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream)
{
    typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor, cudaStream_t stream);
    static const caller_t callers[5][17] =
    {
        {
            0,
            linearRowFilter_caller<1 , T, D, BrdRowReflect101>,
            linearRowFilter_caller<2 , T, D, BrdRowReflect101>,
            linearRowFilter_caller<3 , T, D, BrdRowReflect101>,
            linearRowFilter_caller<4 , T, D, BrdRowReflect101>,
            linearRowFilter_caller<5 , T, D, BrdRowReflect101>,
            linearRowFilter_caller<6 , T, D, BrdRowReflect101>,
            linearRowFilter_caller<7 , T, D, BrdRowReflect101>,
            linearRowFilter_caller<8 , T, D, BrdRowReflect101>,
            linearRowFilter_caller<9 , T, D, BrdRowReflect101>,
            linearRowFilter_caller<10, T, D, BrdRowReflect101>,
            linearRowFilter_caller<11, T, D, BrdRowReflect101>,
            linearRowFilter_caller<12, T, D, BrdRowReflect101>,
            linearRowFilter_caller<13, T, D, BrdRowReflect101>,
            linearRowFilter_caller<14, T, D, BrdRowReflect101>,
            linearRowFilter_caller<15, T, D, BrdRowReflect101>,
            linearRowFilter_caller<16, T, D, BrdRowReflect101>
        },
        {
            0,
            linearRowFilter_caller<1 , T, D, BrdRowReplicate>,
            linearRowFilter_caller<2 , T, D, BrdRowReplicate>,
            linearRowFilter_caller<3 , T, D, BrdRowReplicate>,
            linearRowFilter_caller<4 , T, D, BrdRowReplicate>,
            linearRowFilter_caller<5 , T, D, BrdRowReplicate>,
            linearRowFilter_caller<6 , T, D, BrdRowReplicate>,
            linearRowFilter_caller<7 , T, D, BrdRowReplicate>,
            linearRowFilter_caller<8 , T, D, BrdRowReplicate>,
            linearRowFilter_caller<9 , T, D, BrdRowReplicate>,
            linearRowFilter_caller<10, T, D, BrdRowReplicate>,
            linearRowFilter_caller<11, T, D, BrdRowReplicate>,
            linearRowFilter_caller<12, T, D, BrdRowReplicate>,
            linearRowFilter_caller<13, T, D, BrdRowReplicate>,
            linearRowFilter_caller<14, T, D, BrdRowReplicate>,
            linearRowFilter_caller<15, T, D, BrdRowReplicate>,
            linearRowFilter_caller<16, T, D, BrdRowReplicate>
        },
        {
            0,
            linearRowFilter_caller<1 , T, D, BrdRowConstant>,
            linearRowFilter_caller<2 , T, D, BrdRowConstant>,
            linearRowFilter_caller<3 , T, D, BrdRowConstant>,
            linearRowFilter_caller<4 , T, D, BrdRowConstant>,
            linearRowFilter_caller<5 , T, D, BrdRowConstant>,
            linearRowFilter_caller<6 , T, D, BrdRowConstant>,
            linearRowFilter_caller<7 , T, D, BrdRowConstant>,
            linearRowFilter_caller<8 , T, D, BrdRowConstant>,
            linearRowFilter_caller<9 , T, D, BrdRowConstant>,
            linearRowFilter_caller<10, T, D, BrdRowConstant>,
            linearRowFilter_caller<11, T, D, BrdRowConstant>,
            linearRowFilter_caller<12, T, D, BrdRowConstant>,
            linearRowFilter_caller<13, T, D, BrdRowConstant>,
            linearRowFilter_caller<14, T, D, BrdRowConstant>,
            linearRowFilter_caller<15, T, D, BrdRowConstant>,
            linearRowFilter_caller<16, T, D, BrdRowConstant>
        },
        {
            0,
            linearRowFilter_caller<1 , T, D, BrdRowReflect>,
            linearRowFilter_caller<2 , T, D, BrdRowReflect>,
            linearRowFilter_caller<3 , T, D, BrdRowReflect>,
            linearRowFilter_caller<4 , T, D, BrdRowReflect>,
            linearRowFilter_caller<5 , T, D, BrdRowReflect>,
            linearRowFilter_caller<6 , T, D, BrdRowReflect>,
            linearRowFilter_caller<7 , T, D, BrdRowReflect>,
            linearRowFilter_caller<8 , T, D, BrdRowReflect>,
            linearRowFilter_caller<9 , T, D, BrdRowReflect>,
            linearRowFilter_caller<10, T, D, BrdRowReflect>,
            linearRowFilter_caller<11, T, D, BrdRowReflect>,
            linearRowFilter_caller<12, T, D, BrdRowReflect>,
            linearRowFilter_caller<13, T, D, BrdRowReflect>,
            linearRowFilter_caller<14, T, D, BrdRowReflect>,
            linearRowFilter_caller<15, T, D, BrdRowReflect>,
            linearRowFilter_caller<16, T, D, BrdRowReflect>
        },
        {
            0,
            linearRowFilter_caller<1 , T, D, BrdRowWrap>,
            linearRowFilter_caller<2 , T, D, BrdRowWrap>,
            linearRowFilter_caller<3 , T, D, BrdRowWrap>,
            linearRowFilter_caller<4 , T, D, BrdRowWrap>,
            linearRowFilter_caller<5 , T, D, BrdRowWrap>,
            linearRowFilter_caller<6 , T, D, BrdRowWrap>,
            linearRowFilter_caller<7 , T, D, BrdRowWrap>,
            linearRowFilter_caller<8 , T, D, BrdRowWrap>,
            linearRowFilter_caller<9 , T, D, BrdRowWrap>,
            linearRowFilter_caller<10, T, D, BrdRowWrap>,
            linearRowFilter_caller<11, T, D, BrdRowWrap>,
            linearRowFilter_caller<12, T, D, BrdRowWrap>,
            linearRowFilter_caller<13, T, D, BrdRowWrap>,
            linearRowFilter_caller<14, T, D, BrdRowWrap>,
            linearRowFilter_caller<15, T, D, BrdRowWrap>,
            linearRowFilter_caller<16, T, D, BrdRowWrap>
        }
    };

    loadKernel(kernel, ksize);

    callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, stream);
}

template void linearRowFilter_gpu<uchar , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<uchar4, float4>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearRowFilter_gpu<short , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
//template void linearRowFilter_gpu<short2, float2>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<short3, float3>(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<int   , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
template void linearRowFilter_gpu<float , float >(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);

} // namespace row_filter

END_OPENCV_DEVICE_NAMESPACE
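Dispatch illustration, not in the patch; it assumes brd_type is an index matching the row order of the table above (0 reflect-101, 1 replicate, 2 constant, 3 reflect, 4 wrap). A 7-tap, replicate-border row filter over 8-bit input with a centered anchor then resolves as:

    // callers[1][7] == linearRowFilter_caller<7, uchar, float, BrdRowReplicate>
    linearRowFilter_gpu<uchar, float>(src, dst, kernel, 7, 3 /*anchor*/, 1 /*brd_type, assumed*/, 0);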
@ -43,9 +43,9 @@
#ifndef __OPENCV_CUDA_SAFE_CALL_HPP__
#define __OPENCV_CUDA_SAFE_CALL_HPP__

#include <cuda_runtime_api.h>
#include <cufft.h>
#include <cublas.h>
#include "NCV.hpp"

#if defined(__GNUC__)
@ -62,46 +62,44 @@
#define cublasSafeCall(expr) ___cublasSafeCall(expr, __FILE__, __LINE__)
#endif

namespace cv { namespace gpu {

void error(const char *error_string, const char *file, const int line, const char *func = "");
void nppError(int err, const char *file, const int line, const char *func = "");
void ncvError(int err, const char *file, const int line, const char *func = "");
void cufftError(int err, const char *file, const int line, const char *func = "");
void cublasError(int err, const char *file, const int line, const char *func = "");

static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
    if (cudaSuccess != err)
        cv::gpu::error(cudaGetErrorString(err), file, line, func);
}

static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
    if (err < 0)
        cv::gpu::nppError(err, file, line, func);
}

static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
{
    if (NCV_SUCCESS != err)
        cv::gpu::ncvError(err, file, line, func);
}

static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
{
    if (CUFFT_SUCCESS != err)
        cv::gpu::cufftError(err, file, line, func);
}

static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
{
    if (CUBLAS_STATUS_SUCCESS != err)
        cv::gpu::cublasError(err, file, line, func);
}

}}

#endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */
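For context: each ___xxxSafeCall wrapper is reached through a macro that injects the call site, as the cublasSafeCall define above shows. The cuda variant is assumed by analogy, since its define falls outside this hunk:

    // assumed form, mirroring cublasSafeCall above
    #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)

    // which makes the call sites seen in the .cu files of this commit one-liners:
    cudaSafeCall( cudaMemcpyToSymbol(c_kernel, kernel, ksize * sizeof(float)) );
    cudaSafeCall( cudaGetLastError() );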
@ -42,465 +42,467 @@
#include "internal_shared.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

namespace split_merge {

template <typename T, size_t elem_size = sizeof(T)>
struct TypeTraits
{
    typedef T type;
    typedef T type2;
    typedef T type3;
    typedef T type4;
};

template <typename T>
struct TypeTraits<T, 1>
{
    typedef char type;
    typedef char2 type2;
    typedef char3 type3;
    typedef char4 type4;
};

template <typename T>
struct TypeTraits<T, 2>
{
    typedef short type;
    typedef short2 type2;
    typedef short3 type3;
    typedef short4 type4;
};

template <typename T>
struct TypeTraits<T, 4>
{
    typedef int type;
    typedef int2 type2;
    typedef int3 type3;
    typedef int4 type4;
};

template <typename T>
struct TypeTraits<T, 8>
{
    typedef double type;
    typedef double2 type2;
    //typedef double3 type3;
    //typedef double4 type3;
};

typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);
typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);
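A note on the sizeof-keyed TypeTraits above (derived from the specializations, not new code): it folds every element type into one canonical type per width, so the kernels below are instantiated once per element size rather than once per pixel type.

    // sizeof(float) == 4, so float channels reuse the int instantiations:
    //   TypeTraits<float>::type2  ->  int2
    //   TypeTraits<int>::type2    ->  int2
    //   TypeTraits<uchar>::type4  ->  char4   (sizeof(uchar) == 1)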
//------------------------------------------------------------
// Merge

template <typename T>
__global__ void mergeC2_(const uchar* src0, size_t src0_step,
                         const uchar* src1, size_t src1_step,
                         int rows, int cols, uchar* dst, size_t dst_step)
{
    typedef typename TypeTraits<T>::type2 dst_type;

    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    const T* src0_y = (const T*)(src0 + y * src0_step);
    const T* src1_y = (const T*)(src1 + y * src1_step);
    dst_type* dst_y = (dst_type*)(dst + y * dst_step);

    if (x < cols && y < rows)
    {
        dst_type dst_elem;
        dst_elem.x = src0_y[x];
        dst_elem.y = src1_y[x];
        dst_y[x] = dst_elem;
    }
}


template <typename T>
__global__ void mergeC3_(const uchar* src0, size_t src0_step,
                         const uchar* src1, size_t src1_step,
                         const uchar* src2, size_t src2_step,
                         int rows, int cols, uchar* dst, size_t dst_step)
{
    typedef typename TypeTraits<T>::type3 dst_type;

    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    const T* src0_y = (const T*)(src0 + y * src0_step);
    const T* src1_y = (const T*)(src1 + y * src1_step);
    const T* src2_y = (const T*)(src2 + y * src2_step);
    dst_type* dst_y = (dst_type*)(dst + y * dst_step);

    if (x < cols && y < rows)
    {
        dst_type dst_elem;
        dst_elem.x = src0_y[x];
        dst_elem.y = src1_y[x];
        dst_elem.z = src2_y[x];
        dst_y[x] = dst_elem;
    }
}


template <>
__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
                                 const uchar* src1, size_t src1_step,
                                 const uchar* src2, size_t src2_step,
                                 int rows, int cols, uchar* dst, size_t dst_step)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    const double* src0_y = (const double*)(src0 + y * src0_step);
    const double* src1_y = (const double*)(src1 + y * src1_step);
    const double* src2_y = (const double*)(src2 + y * src2_step);
    double* dst_y = (double*)(dst + y * dst_step);

    if (x < cols && y < rows)
    {
        dst_y[3 * x] = src0_y[x];
        dst_y[3 * x + 1] = src1_y[x];
        dst_y[3 * x + 2] = src2_y[x];
    }
}


template <typename T>
__global__ void mergeC4_(const uchar* src0, size_t src0_step,
                         const uchar* src1, size_t src1_step,
                         const uchar* src2, size_t src2_step,
                         const uchar* src3, size_t src3_step,
                         int rows, int cols, uchar* dst, size_t dst_step)
{
    typedef typename TypeTraits<T>::type4 dst_type;

    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    const T* src0_y = (const T*)(src0 + y * src0_step);
    const T* src1_y = (const T*)(src1 + y * src1_step);
    const T* src2_y = (const T*)(src2 + y * src2_step);
    const T* src3_y = (const T*)(src3 + y * src3_step);
    dst_type* dst_y = (dst_type*)(dst + y * dst_step);

    if (x < cols && y < rows)
    {
        dst_type dst_elem;
        dst_elem.x = src0_y[x];
        dst_elem.y = src1_y[x];
        dst_elem.z = src2_y[x];
        dst_elem.w = src3_y[x];
        dst_y[x] = dst_elem;
    }
}


template <>
__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
                                 const uchar* src1, size_t src1_step,
                                 const uchar* src2, size_t src2_step,
                                 const uchar* src3, size_t src3_step,
                                 int rows, int cols, uchar* dst, size_t dst_step)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    const double* src0_y = (const double*)(src0 + y * src0_step);
    const double* src1_y = (const double*)(src1 + y * src1_step);
    const double* src2_y = (const double*)(src2 + y * src2_step);
    const double* src3_y = (const double*)(src3 + y * src3_step);
    double2* dst_y = (double2*)(dst + y * dst_step);

    if (x < cols && y < rows)
    {
        dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
        dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
    }
}
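Why the <double> specializations exist is not stated in the patch; the commented-out type3/type4 members of TypeTraits<T, 8> suggest the authors deliberately avoided 3- and 4-wide double vectors, presumably because double2, at 16 bytes, is the widest naturally aligned vector store, so a 4-channel double pixel is written as two double2 stores instead:

    // from mergeC4_<double> above: two 16-byte stores per pixel
    //   dst_y[2 * x]     = make_double2(src0_y[x], src1_y[x]);
    //   dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);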
template <typename T>
static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
    dim3 blockDim(32, 8);
    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
    mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
            src[0].data, src[0].step,
            src[1].data, src[1].step,
            dst.rows, dst.cols, dst.data, dst.step);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall(cudaDeviceSynchronize());
}


template <typename T>
static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
    dim3 blockDim(32, 8);
    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
    mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
            src[0].data, src[0].step,
            src[1].data, src[1].step,
            src[2].data, src[2].step,
            dst.rows, dst.cols, dst.data, dst.step);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall(cudaDeviceSynchronize());
}


template <typename T>
static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
{
    dim3 blockDim(32, 8);
    dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
    mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
            src[0].data, src[0].step,
            src[1].data, src[1].step,
            src[2].data, src[2].step,
            src[3].data, src[3].step,
            dst.rows, dst.cols, dst.data, dst.step);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall(cudaDeviceSynchronize());
}


void merge_caller(const DevMem2Db* src, DevMem2Db& dst,
                  int total_channels, size_t elem_size,
                  const cudaStream_t& stream)
{
    static MergeFunction merge_func_tbl[] =
    {
        mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
        mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
        mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
    };

    size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
    MergeFunction merge_func = merge_func_tbl[merge_func_id];

    if (merge_func == 0)
        cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);

    merge_func(src, dst, stream);
}
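A worked pass through the table indexing above, arithmetic only: merging the 3 channels of a 16-bit image gives merge_func_id = (3 - 2) * 5 + (2 >> 1) = 6, i.e. mergeC3_<short>; a 3-channel float image (elem_size == 4) lands on mergeC3_<int> through the TypeTraits size-folding, and the zero entries at column offset 3 catch unsupported widths via the error path.

    // merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1)
    //   2ch: ids 0..4    3ch: ids 5..9    4ch: ids 10..14
    //   column: char -> 0, short -> 1, int/float -> 2, (unused) -> 3, double -> 4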
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------
|
||||
// Split
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void splitC2_(const uchar* src, size_t src_step,
|
||||
int rows, int cols,
|
||||
uchar* dst0, size_t dst0_step,
|
||||
uchar* dst1, size_t dst1_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type2 src_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const src_type* src_y = (const src_type*)(src + y * src_step);
|
||||
T* dst0_y = (T*)(dst0 + y * dst0_step);
|
||||
T* dst1_y = (T*)(dst1 + y * dst1_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
typedef char type;
|
||||
typedef char2 type2;
|
||||
typedef char3 type3;
|
||||
typedef char4 type4;
|
||||
src_type src_elem = src_y[x];
|
||||
dst0_y[x] = src_elem.x;
|
||||
dst1_y[x] = src_elem.y;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void splitC3_(const uchar* src, size_t src_step,
|
||||
int rows, int cols,
|
||||
uchar* dst0, size_t dst0_step,
|
||||
uchar* dst1, size_t dst1_step,
|
||||
uchar* dst2, size_t dst2_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type3 src_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const src_type* src_y = (const src_type*)(src + y * src_step);
|
||||
T* dst0_y = (T*)(dst0 + y * dst0_step);
|
||||
T* dst1_y = (T*)(dst1 + y * dst1_step);
|
||||
T* dst2_y = (T*)(dst2 + y * dst2_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
src_type src_elem = src_y[x];
|
||||
dst0_y[x] = src_elem.x;
|
||||
dst1_y[x] = src_elem.y;
|
||||
dst2_y[x] = src_elem.z;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
__global__ void splitC3_<double>(
|
||||
const uchar* src, size_t src_step, int rows, int cols,
|
||||
uchar* dst0, size_t dst0_step,
|
||||
uchar* dst1, size_t dst1_step,
|
||||
uchar* dst2, size_t dst2_step)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const double* src_y = (const double*)(src + y * src_step);
|
||||
double* dst0_y = (double*)(dst0 + y * dst0_step);
|
||||
double* dst1_y = (double*)(dst1 + y * dst1_step);
|
||||
double* dst2_y = (double*)(dst2 + y * dst2_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst0_y[x] = src_y[3 * x];
|
||||
dst1_y[x] = src_y[3 * x + 1];
|
||||
dst2_y[x] = src_y[3 * x + 2];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
|
||||
uchar* dst0, size_t dst0_step,
|
||||
uchar* dst1, size_t dst1_step,
|
||||
uchar* dst2, size_t dst2_step,
|
||||
uchar* dst3, size_t dst3_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type4 src_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const src_type* src_y = (const src_type*)(src + y * src_step);
|
||||
T* dst0_y = (T*)(dst0 + y * dst0_step);
|
||||
T* dst1_y = (T*)(dst1 + y * dst1_step);
|
||||
T* dst2_y = (T*)(dst2 + y * dst2_step);
|
||||
T* dst3_y = (T*)(dst3 + y * dst3_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
src_type src_elem = src_y[x];
|
||||
dst0_y[x] = src_elem.x;
|
||||
dst1_y[x] = src_elem.y;
|
||||
dst2_y[x] = src_elem.z;
|
||||
dst3_y[x] = src_elem.w;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
__global__ void splitC4_<double>(
|
||||
const uchar* src, size_t src_step, int rows, int cols,
|
||||
uchar* dst0, size_t dst0_step,
|
||||
uchar* dst1, size_t dst1_step,
|
||||
uchar* dst2, size_t dst2_step,
|
||||
uchar* dst3, size_t dst3_step)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const double2* src_y = (const double2*)(src + y * src_step);
|
||||
double* dst0_y = (double*)(dst0 + y * dst0_step);
|
||||
double* dst1_y = (double*)(dst1 + y * dst1_step);
|
||||
double* dst2_y = (double*)(dst2 + y * dst2_step);
|
||||
double* dst3_y = (double*)(dst3 + y * dst3_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
double2 src_elem1 = src_y[2 * x];
|
||||
double2 src_elem2 = src_y[2 * x + 1];
|
||||
dst0_y[x] = src_elem1.x;
|
||||
dst1_y[x] = src_elem1.y;
|
||||
dst2_y[x] = src_elem2.x;
|
||||
dst3_y[x] = src_elem2.y;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
|
||||
splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src.data, src.step, src.rows, src.cols,
|
||||
dst[0].data, dst[0].step,
|
||||
dst[1].data, dst[1].step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
|
||||
splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src.data, src.step, src.rows, src.cols,
|
||||
dst[0].data, dst[0].step,
|
||||
dst[1].data, dst[1].step,
|
||||
dst[2].data, dst[2].step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
|
||||
splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src.data, src.step, src.rows, src.cols,
|
||||
dst[0].data, dst[0].step,
|
||||
dst[1].data, dst[1].step,
|
||||
dst[2].data, dst[2].step,
|
||||
dst[3].data, dst[3].step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream)
|
||||
{
|
||||
static SplitFunction split_func_tbl[] =
|
||||
{
|
||||
splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
|
||||
splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
|
||||
splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct TypeTraits<T, 2>
|
||||
{
|
||||
typedef short type;
|
||||
typedef short2 type2;
|
||||
typedef short3 type3;
|
||||
typedef short4 type4;
|
||||
};
|
||||
size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
|
||||
SplitFunction split_func = split_func_tbl[split_func_id];
|
||||
|
||||
template <typename T>
|
||||
struct TypeTraits<T, 4>
|
||||
{
|
||||
typedef int type;
|
||||
typedef int2 type2;
|
||||
typedef int3 type3;
|
||||
typedef int4 type4;
|
||||
};
|
||||
if (split_func == 0)
|
||||
cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
|
||||
|
||||
template <typename T>
|
||||
struct TypeTraits<T, 8>
|
||||
{
|
||||
typedef double type;
|
||||
typedef double2 type2;
|
||||
//typedef double3 type3;
|
||||
//typedef double4 type3;
|
||||
};
|
||||
split_func(src, dst, stream);
|
||||
}
|
||||
|
||||
typedef void (*MergeFunction)(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream);
|
||||
typedef void (*SplitFunction)(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream);
|
||||
} // namespace split_merge
|
||||
|
||||
//------------------------------------------------------------
|
||||
// Merge
|
||||
|
||||
template <typename T>
|
||||
__global__ void mergeC2_(const uchar* src0, size_t src0_step,
|
||||
const uchar* src1, size_t src1_step,
|
||||
int rows, int cols, uchar* dst, size_t dst_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type2 dst_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const T* src0_y = (const T*)(src0 + y * src0_step);
|
||||
const T* src1_y = (const T*)(src1 + y * src1_step);
|
||||
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst_type dst_elem;
|
||||
dst_elem.x = src0_y[x];
|
||||
dst_elem.y = src1_y[x];
|
||||
dst_y[x] = dst_elem;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void mergeC3_(const uchar* src0, size_t src0_step,
|
||||
const uchar* src1, size_t src1_step,
|
||||
const uchar* src2, size_t src2_step,
|
||||
int rows, int cols, uchar* dst, size_t dst_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type3 dst_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const T* src0_y = (const T*)(src0 + y * src0_step);
|
||||
const T* src1_y = (const T*)(src1 + y * src1_step);
|
||||
const T* src2_y = (const T*)(src2 + y * src2_step);
|
||||
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst_type dst_elem;
|
||||
dst_elem.x = src0_y[x];
|
||||
dst_elem.y = src1_y[x];
|
||||
dst_elem.z = src2_y[x];
|
||||
dst_y[x] = dst_elem;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
__global__ void mergeC3_<double>(const uchar* src0, size_t src0_step,
|
||||
const uchar* src1, size_t src1_step,
|
||||
const uchar* src2, size_t src2_step,
|
||||
int rows, int cols, uchar* dst, size_t dst_step)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const double* src0_y = (const double*)(src0 + y * src0_step);
|
||||
const double* src1_y = (const double*)(src1 + y * src1_step);
|
||||
const double* src2_y = (const double*)(src2 + y * src2_step);
|
||||
double* dst_y = (double*)(dst + y * dst_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst_y[3 * x] = src0_y[x];
|
||||
dst_y[3 * x + 1] = src1_y[x];
|
||||
dst_y[3 * x + 2] = src2_y[x];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
__global__ void mergeC4_(const uchar* src0, size_t src0_step,
|
||||
const uchar* src1, size_t src1_step,
|
||||
const uchar* src2, size_t src2_step,
|
||||
const uchar* src3, size_t src3_step,
|
||||
int rows, int cols, uchar* dst, size_t dst_step)
|
||||
{
|
||||
typedef typename TypeTraits<T>::type4 dst_type;
|
||||
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const T* src0_y = (const T*)(src0 + y * src0_step);
|
||||
const T* src1_y = (const T*)(src1 + y * src1_step);
|
||||
const T* src2_y = (const T*)(src2 + y * src2_step);
|
||||
const T* src3_y = (const T*)(src3 + y * src3_step);
|
||||
dst_type* dst_y = (dst_type*)(dst + y * dst_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst_type dst_elem;
|
||||
dst_elem.x = src0_y[x];
|
||||
dst_elem.y = src1_y[x];
|
||||
dst_elem.z = src2_y[x];
|
||||
dst_elem.w = src3_y[x];
|
||||
dst_y[x] = dst_elem;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
__global__ void mergeC4_<double>(const uchar* src0, size_t src0_step,
|
||||
const uchar* src1, size_t src1_step,
|
||||
const uchar* src2, size_t src2_step,
|
||||
const uchar* src3, size_t src3_step,
|
||||
int rows, int cols, uchar* dst, size_t dst_step)
|
||||
{
|
||||
const int x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
const double* src0_y = (const double*)(src0 + y * src0_step);
|
||||
const double* src1_y = (const double*)(src1 + y * src1_step);
|
||||
const double* src2_y = (const double*)(src2 + y * src2_step);
|
||||
const double* src3_y = (const double*)(src3 + y * src3_step);
|
||||
double2* dst_y = (double2*)(dst + y * dst_step);
|
||||
|
||||
if (x < cols && y < rows)
|
||||
{
|
||||
dst_y[2 * x] = make_double2(src0_y[x], src1_y[x]);
|
||||
dst_y[2 * x + 1] = make_double2(src2_y[x], src3_y[x]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
|
||||
mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src[0].data, src[0].step,
|
||||
src[1].data, src[1].step,
|
||||
dst.rows, dst.cols, dst.data, dst.step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
|
||||
mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src[0].data, src[0].step,
|
||||
src[1].data, src[1].step,
|
||||
src[2].data, src[2].step,
|
||||
dst.rows, dst.cols, dst.data, dst.step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
|
||||
{
|
||||
dim3 blockDim(32, 8);
|
||||
dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
|
||||
mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
|
||||
src[0].data, src[0].step,
|
||||
src[1].data, src[1].step,
|
||||
src[2].data, src[2].step,
|
||||
src[3].data, src[3].step,
|
||||
dst.rows, dst.cols, dst.data, dst.step);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
||||
extern "C" void merge_caller(const DevMem2Db* src, DevMem2Db& dst,
|
||||
int total_channels, size_t elem_size,
|
||||
const cudaStream_t& stream)
|
||||
{
|
||||
static MergeFunction merge_func_tbl[] =
|
||||
{
|
||||
mergeC2_<char>, mergeC2_<short>, mergeC2_<int>, 0, mergeC2_<double>,
|
||||
mergeC3_<char>, mergeC3_<short>, mergeC3_<int>, 0, mergeC3_<double>,
|
||||
mergeC4_<char>, mergeC4_<short>, mergeC4_<int>, 0, mergeC4_<double>,
|
||||
};
|
||||
|
||||
size_t merge_func_id = (total_channels - 2) * 5 + (elem_size >> 1);
|
||||
MergeFunction merge_func = merge_func_tbl[merge_func_id];
|
||||
|
||||
if (merge_func == 0)
|
||||
cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);
        merge_func(src, dst, stream);
    }

    //------------------------------------------------------------
    // Split

    template <typename T>
    __global__ void splitC2_(const uchar* src, size_t src_step,
                             int rows, int cols,
                             uchar* dst0, size_t dst0_step,
                             uchar* dst1, size_t dst1_step)
    {
        typedef typename TypeTraits<T>::type2 src_type;

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        const src_type* src_y = (const src_type*)(src + y * src_step);
        T* dst0_y = (T*)(dst0 + y * dst0_step);
        T* dst1_y = (T*)(dst1 + y * dst1_step);

        if (x < cols && y < rows)
        {
            src_type src_elem = src_y[x];
            dst0_y[x] = src_elem.x;
            dst1_y[x] = src_elem.y;
        }
    }

    template <typename T>
    __global__ void splitC3_(const uchar* src, size_t src_step,
                             int rows, int cols,
                             uchar* dst0, size_t dst0_step,
                             uchar* dst1, size_t dst1_step,
                             uchar* dst2, size_t dst2_step)
    {
        typedef typename TypeTraits<T>::type3 src_type;

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        const src_type* src_y = (const src_type*)(src + y * src_step);
        T* dst0_y = (T*)(dst0 + y * dst0_step);
        T* dst1_y = (T*)(dst1 + y * dst1_step);
        T* dst2_y = (T*)(dst2 + y * dst2_step);

        if (x < cols && y < rows)
        {
            src_type src_elem = src_y[x];
            dst0_y[x] = src_elem.x;
            dst1_y[x] = src_elem.y;
            dst2_y[x] = src_elem.z;
        }
    }

    template <>
    __global__ void splitC3_<double>(
            const uchar* src, size_t src_step, int rows, int cols,
            uchar* dst0, size_t dst0_step,
            uchar* dst1, size_t dst1_step,
            uchar* dst2, size_t dst2_step)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        const double* src_y = (const double*)(src + y * src_step);
        double* dst0_y = (double*)(dst0 + y * dst0_step);
        double* dst1_y = (double*)(dst1 + y * dst1_step);
        double* dst2_y = (double*)(dst2 + y * dst2_step);

        if (x < cols && y < rows)
        {
            dst0_y[x] = src_y[3 * x];
            dst1_y[x] = src_y[3 * x + 1];
            dst2_y[x] = src_y[3 * x + 2];
        }
    }

    template <typename T>
    __global__ void splitC4_(const uchar* src, size_t src_step, int rows, int cols,
                             uchar* dst0, size_t dst0_step,
                             uchar* dst1, size_t dst1_step,
                             uchar* dst2, size_t dst2_step,
                             uchar* dst3, size_t dst3_step)
    {
        typedef typename TypeTraits<T>::type4 src_type;

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        const src_type* src_y = (const src_type*)(src + y * src_step);
        T* dst0_y = (T*)(dst0 + y * dst0_step);
        T* dst1_y = (T*)(dst1 + y * dst1_step);
        T* dst2_y = (T*)(dst2 + y * dst2_step);
        T* dst3_y = (T*)(dst3 + y * dst3_step);

        if (x < cols && y < rows)
        {
            src_type src_elem = src_y[x];
            dst0_y[x] = src_elem.x;
            dst1_y[x] = src_elem.y;
            dst2_y[x] = src_elem.z;
            dst3_y[x] = src_elem.w;
        }
    }

    template <>
    __global__ void splitC4_<double>(
            const uchar* src, size_t src_step, int rows, int cols,
            uchar* dst0, size_t dst0_step,
            uchar* dst1, size_t dst1_step,
            uchar* dst2, size_t dst2_step,
            uchar* dst3, size_t dst3_step)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        const double2* src_y = (const double2*)(src + y * src_step);
        double* dst0_y = (double*)(dst0 + y * dst0_step);
        double* dst1_y = (double*)(dst1 + y * dst1_step);
        double* dst2_y = (double*)(dst2 + y * dst2_step);
        double* dst3_y = (double*)(dst3 + y * dst3_step);

        if (x < cols && y < rows)
        {
            double2 src_elem1 = src_y[2 * x];
            double2 src_elem2 = src_y[2 * x + 1];
            dst0_y[x] = src_elem1.x;
            dst1_y[x] = src_elem1.y;
            dst2_y[x] = src_elem2.x;
            dst3_y[x] = src_elem2.y;
        }
    }
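The double specializations above are needed because CUDA vector loads top out at 16 bytes, so there is no single-transaction load for a 24- or 32-byte pixel: splitC3_<double> falls back to three scalar loads, and splitC4_<double> reads each pixel as two double2s. A restatement of the 4-channel shuffle (splitC4_double_pixel is a hypothetical name, not in this file):

// For pixel x the interleaved layout is [c0 c1 c2 c3]; two 16-byte double2
// loads recover it without an unsupported 32-byte vector load.
__device__ void splitC4_double_pixel(const double2* src_y, int x,
                                     double* d0, double* d1, double* d2, double* d3)
{
    double2 a = src_y[2 * x];     // channels 0 and 1
    double2 b = src_y[2 * x + 1]; // channels 2 and 3
    d0[x] = a.x; d1[x] = a.y;
    d2[x] = b.x; d3[x] = b.y;
}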
    template <typename T>
    static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
    {
        dim3 blockDim(32, 8);
        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
        splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
                src.data, src.step, src.rows, src.cols,
                dst[0].data, dst[0].step,
                dst[1].data, dst[1].step);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }

    template <typename T>
    static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
    {
        dim3 blockDim(32, 8);
        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
        splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
                src.data, src.step, src.rows, src.cols,
                dst[0].data, dst[0].step,
                dst[1].data, dst[1].step,
                dst[2].data, dst[2].step);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }

    template <typename T>
    static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
    {
        dim3 blockDim(32, 8);
        dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
        splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
                src.data, src.step, src.rows, src.cols,
                dst[0].data, dst[0].step,
                dst[1].data, dst[1].step,
                dst[2].data, dst[2].step,
                dst[3].data, dst[3].step);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }

    extern "C" void split_caller(const DevMem2Db& src, DevMem2Db* dst,
                                 int num_channels, size_t elem_size1,
                                 const cudaStream_t& stream)
    {
        static SplitFunction split_func_tbl[] =
        {
            splitC2_<char>, splitC2_<short>, splitC2_<int>, 0, splitC2_<double>,
            splitC3_<char>, splitC3_<short>, splitC3_<int>, 0, splitC3_<double>,
            splitC4_<char>, splitC4_<short>, splitC4_<int>, 0, splitC4_<double>,
        };

        size_t split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1);
        SplitFunction split_func = split_func_tbl[split_func_id];

        if (split_func == 0)
            cv::gpu::error("Unsupported channel count or data type", __FILE__, __LINE__);

        split_func(src, dst, stream);
    }
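The table lookup packs both dispatch dimensions into one index: rows are the channel count (2, 3, 4) and columns are the per-channel element size, halved so that sizes 1, 2, 4, 8 land on columns 0, 1, 2, 4. Column 3 is unreachable (it would need a 6- or 7-byte element), which is why the zero entries sit there. A worked check of the arithmetic:

// split_func_id = (num_channels - 2) * 5 + (elem_size1 >> 1)
//
//   3 channels of short  (elem_size1 = 2): (3-2)*5 + (2>>1) =  5 + 1 =  6 -> splitC3_<short>
//   4 channels of double (elem_size1 = 8): (4-2)*5 + (8>>1) = 10 + 4 = 14 -> splitC4_<double>
//
// Indices 6 and 14 match the table layout above (5 entries per channel row).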
}}} // namespace cv::gpu::split_merge

END_OPENCV_DEVICE_NAMESPACE
@ -40,23 +40,18 @@
//
//M*/

//#include "internal_shared.hpp"
#include "opencv2/gpu/devmem2d.hpp"
#include "safe_call.hpp"
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
#include "internal_shared.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

using namespace cv::gpu;
namespace stereobm {

//////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////// Streeo BM ////////////////////////////////////////////////
/////////////////////////////////////// Stereo BM ////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////

#define ROWSperTHREAD 21 // the number of rows a thread will process

namespace cv { namespace gpu { namespace bm
{

#define BLOCK_W 128 // the thread block width (464)
#define N_DISPARITIES 8
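For context, the divUp helper removed above (and provided by internal_shared.hpp after this commit) is ordinary ceiling division, used everywhere below to size launch grids. A quick numeric check, with an illustrative image size:

// divUp(total, grain) == ceil(total / grain) for positive ints.
// With a 640x479 image and the 32x8 blocks used throughout these files:
//   divUp(640, 32) = (640 + 31) / 32 = 20 blocks in x
//   divUp(479, 8)  = (479 + 7)  / 8  = 60 blocks in y (ragged edge rounds up)
// Kernels then guard with (x < cols && y < rows) so surplus threads exit.
dim3 blockDim(32, 8);
dim3 gridDim(divUp(640, blockDim.x), divUp(479, blockDim.y));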
@ -117,7 +112,7 @@ __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned
    __syncthreads();
    ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));

    int mssd = min(min(min(ssd[0], ssd[1]), min(ssd[4], ssd[5])), min(min(ssd[2], ssd[3]), min(ssd[6], ssd[7])));
    int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));

    int bestIdx = 0;
    for (int i = 0; i < N_DISPARITIES; i++)
@ -252,7 +247,7 @@ __global__ void stereoKernel(unsigned char *left, unsigned char *right, size_t i
        for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
            *ptr = 0xFFFFFFFF;
    }*/
    int end_row = min(ROWSperTHREAD, cheight - Y - RADIUS);
    int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);
    int y_tex;
    int x_tex = X - RADIUS;

@ -346,7 +341,7 @@ const static kernel_caller_t callers[] =
};
const int calles_num = sizeof(callers)/sizeof(callers[0]);

extern "C" void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream)
void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int maxdisp, int winsz, const DevMem2D_<unsigned int>& minSSD_buf, cudaStream_t& stream)
{
    int winsz2 = winsz >> 1;

@ -375,7 +370,7 @@ extern "C" void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, cons

texture<unsigned char, 2, cudaReadModeElementType> texForSobel;

extern "C" __global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)
__global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    int y = blockDim.y * blockIdx.y + threadIdx.y;
@ -387,12 +382,12 @@ extern "C" __global__ void prefilter_kernel(DevMem2Db output, int prefilterCap)
                   (int)tex2D(texForSobel, x - 1, y + 1) * (-1) + (int)tex2D(texForSobel, x + 1, y + 1) * (1);

        conv = min(min(max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);
        conv = ::min(::min(::max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255);
        output.ptr(y)[x] = conv & 0xFF;
    }
}

extern "C" void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap, cudaStream_t & stream)
void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap, cudaStream_t & stream)
{
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
    cudaSafeCall( cudaBindTexture2D( 0, texForSobel, input.data, desc, input.cols, input.rows, input.step ) );
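The conv clamp in prefilter_kernel maps the signed Sobel response into [0, 2*prefilterCap] before it is stored as a byte. A host-side restatement for checking the arithmetic (clampSobel is a hypothetical helper, not part of the module):

#include <algorithm>

int clampSobel(int conv, int prefilterCap) // mirrors the kernel's clamp line
{
    // e.g. prefilterCap = 31: conv = -100 -> 0, conv = 10 -> 41, conv = 200 -> 62
    return std::min(std::min(std::max(-prefilterCap, conv), prefilterCap) + prefilterCap, 255) & 0xFF;
}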
@ -451,7 +446,7 @@ __device__ float CalcSums(float *cols, float *cols_cache, int winsz)

#define RpT (2 * ROWSperTHREAD) // got experimentally

extern "C" __global__ void textureness_kernel(DevMem2Db disp, int winsz, float threshold)
__global__ void textureness_kernel(DevMem2Db disp, int winsz, float threshold)
{
    int winsz2 = winsz/2;
    int n_dirty_pixels = (winsz2) * 2;
@ -462,7 +457,7 @@ extern "C" __global__ void textureness_kernel(DevMem2Db disp, int winsz, float t

    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int beg_row = blockIdx.y * RpT;
    int end_row = min(beg_row + RpT, disp.rows);
    int end_row = ::min(beg_row + RpT, disp.rows);

    if (x < disp.cols)
    {
@ -510,7 +505,7 @@ extern "C" __global__ void textureness_kernel(DevMem2Db disp, int winsz, float t
    }
}

extern "C" void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream)
void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream)
{
    avgTexturenessThreshold *= winsz * winsz;

@ -537,4 +532,6 @@ extern "C" void postfilter_textureness(const DevMem2Db& input, int winsz, float
    cudaSafeCall( cudaUnbindTexture (texForTF) );
}

}}}
} // namespace stereobm

END_OPENCV_DEVICE_NAMESPACE
@ -44,484 +44,489 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/limits.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace bp
{
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace stereobp {

///////////////////////////////////////////////////////////////
/////////////////////// load constants ////////////////////////
///////////////////////////////////////////////////////////////

__constant__ int cndisp;
__constant__ float cmax_data_term;
__constant__ float cdata_weight;
__constant__ float cmax_disc_term;
__constant__ float cdisc_single_jump;
void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump)
{
    cudaSafeCall( cudaMemcpyToSymbol(cndisp, &ndisp, sizeof(int )) );
    cudaSafeCall( cudaMemcpyToSymbol(cmax_data_term, &max_data_term, sizeof(float)) );
    cudaSafeCall( cudaMemcpyToSymbol(cdata_weight, &data_weight, sizeof(float)) );
    cudaSafeCall( cudaMemcpyToSymbol(cmax_disc_term, &max_disc_term, sizeof(float)) );
    cudaSafeCall( cudaMemcpyToSymbol(cdisc_single_jump, &disc_single_jump, sizeof(float)) );
}
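For orientation, a minimal host-side sketch of staging these __constant__ symbols before any of the kernels below run; the parameter values here are illustrative, not defaults taken from this commit:

// Illustrative values only - the real ones come from the belief-propagation
// matcher's state. One upload per __constant__ symbol; kernels then read them
// without spending kernel-argument space.
int   ndisp            = 64;
float max_data_term    = 10.0f;
float data_weight      = 0.07f;
float max_disc_term    = 1.7f;
float disc_single_jump = 1.0f;

load_constants(ndisp, max_data_term, data_weight, max_disc_term, disc_single_jump);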
///////////////////////////////////////////////////////////////
////////////////////////// comp data //////////////////////////
///////////////////////////////////////////////////////////////

template <int cn> struct PixDiff;
template <> struct PixDiff<1>
{
    __device__ __forceinline__ PixDiff(const uchar* ls)
    {
        l = *ls;
    }
    __device__ __forceinline__ float operator()(const uchar* rs) const
    {
        return ::abs((int)l - *rs);
    }
    uchar l;
};
template <> struct PixDiff<3>
{
    __device__ __forceinline__ PixDiff(const uchar* ls)
    {
        l = *((uchar3*)ls);
    }
    __device__ __forceinline__ float operator()(const uchar* rs) const
    {
        const float tr = 0.299f;
        const float tg = 0.587f;
        const float tb = 0.114f;

        float val = tb * ::abs((int)l.x - rs[0]);
        val += tg * ::abs((int)l.y - rs[1]);
        val += tr * ::abs((int)l.z - rs[2]);

        return val;
    }
    uchar3 l;
};
template <> struct PixDiff<4>
{
    __device__ __forceinline__ PixDiff(const uchar* ls)
    {
        l = *((uchar4*)ls);
    }
    __device__ __forceinline__ float operator()(const uchar* rs) const
    {
        const float tr = 0.299f;
        const float tg = 0.587f;
        const float tb = 0.114f;

        uchar4 r = *((uchar4*)rs);

        float val = tb * ::abs((int)l.x - r.x);
        val += tg * ::abs((int)l.y - r.y);
        val += tr * ::abs((int)l.z - r.z);

        return val;
    }
    uchar4 l;
};

template <int cn, typename D>
__global__ void comp_data(const DevMem2Db left, const PtrStepb right, PtrElemStep_<D> data)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (y > 0 && y < left.rows - 1 && x > 0 && x < left.cols - 1)
    {
        const uchar* ls = left.ptr(y) + x * cn;
        const PixDiff<cn> pixDiff(ls);
        const uchar* rs = right.ptr(y) + x * cn;

        D* ds = data.ptr(y) + x;
        const size_t disp_step = data.step * left.rows;

        for (int disp = 0; disp < cndisp; disp++)
        {
            if (x - disp >= 1)
            {
                float val = pixDiff(rs - disp * cn);

                ds[disp * disp_step] = saturate_cast<D>(fmin(cdata_weight * val, cdata_weight * cmax_data_term));
            }
            else
            {
                ds[disp * disp_step] = saturate_cast<D>(cdata_weight * cmax_data_term);
            }
        }
    }
}
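The tr/tg/tb factors are the BT.601 luma weights, so the multi-channel data cost is effectively a grayscale absolute difference. A CPU restatement of the 3-channel case for checking (pixDiff3 is a hypothetical helper, and the B,G,R channel order is inferred from the tb/tg/tr naming):

#include <cstdlib>

float pixDiff3(const unsigned char l[3], const unsigned char r[3])
{
    const float tr = 0.299f, tg = 0.587f, tb = 0.114f; // BT.601 luma weights
    return tb * std::abs((int)l[0] - (int)r[0])
         + tg * std::abs((int)l[1] - (int)r[1])
         + tr * std::abs((int)l[2] - (int)r[2]);
}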
template<typename T, typename D>
void comp_data_gpu(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);

template <> void comp_data_gpu<uchar, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);

    grid.x = divUp(left.cols, threads.x);
    grid.y = divUp(left.rows, threads.y);

    comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);

    grid.x = divUp(left.cols, threads.x);
    grid.y = divUp(left.rows, threads.y);

    comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

template <> void comp_data_gpu<uchar3, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);

    grid.x = divUp(left.cols, threads.x);
    grid.y = divUp(left.rows, threads.y);

    comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar3, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);

    grid.x = divUp(left.cols, threads.x);
    grid.y = divUp(left.rows, threads.y);

    comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

template <> void comp_data_gpu<uchar4, short>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);

    grid.x = divUp(left.cols, threads.x);
    grid.y = divUp(left.rows, threads.y);

    comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
template <> void comp_data_gpu<uchar4, float>(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);

    grid.x = divUp(left.cols, threads.x);
    grid.y = divUp(left.rows, threads.y);

    comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

///////////////////////////////////////////////////////////////
//////////////////////// data step down ///////////////////////
///////////////////////////////////////////////////////////////

template <typename T>
__global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < dst_cols && y < dst_rows)
    {
        for (int d = 0; d < cndisp; ++d)
        {
            float dst_reg = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];
            dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+0)];
            dst_reg += src.ptr(d * src_rows + (2*y+0))[(2*x+1)];
            dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+1)];

            dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);
        }
    }
}

template<typename T>
void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);

    grid.x = divUp(dst_cols, threads.x);
    grid.y = divUp(dst_rows, threads.y);

    data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
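Each coarse cell of data_step_down just sums the four fine-level costs beneath it, per disparity plane. A CPU sketch of one output cell (stepDownCell is a hypothetical helper, assuming the per-disparity planes are stacked contiguously as in the kernel):

float stepDownCell(const float* src, int src_cols, int src_rows, int d, int x, int y)
{
    const float* plane = src + (size_t)d * src_rows * src_cols; // disparity plane d
    return plane[(2*y+0) * src_cols + (2*x+0)]
         + plane[(2*y+1) * src_cols + (2*x+0)]
         + plane[(2*y+0) * src_cols + (2*x+1)]
         + plane[(2*y+1) * src_cols + (2*x+1)];
}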
///////////////////////////////////////////////////////////////
/////////////////// level up messages ////////////////////////
///////////////////////////////////////////////////////////////

template <typename T>
__global__ void level_up_message(int dst_cols, int dst_rows, int src_rows, const PtrElemStep_<T> src, PtrElemStep_<T> dst)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < dst_cols && y < dst_rows)
    {
        const size_t dst_disp_step = dst.step * dst_rows;
        const size_t src_disp_step = src.step * src_rows;

        T* dstr = dst.ptr(y ) + x;
        const T* srcr = src.ptr(y/2) + x/2;

        for (int d = 0; d < cndisp; ++d)
            dstr[d * dst_disp_step] = srcr[d * src_disp_step];
    }
}

template <typename T>
void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);

    grid.x = divUp(dst_cols, threads.x);
    grid.y = divUp(dst_rows, threads.y);

    int src_idx = (dst_idx + 1) & 1;

    level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);
    cudaSafeCall( cudaGetLastError() );

    level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);
    cudaSafeCall( cudaGetLastError() );

    level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);
    cudaSafeCall( cudaGetLastError() );

    level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

template void level_up_messages_gpu<short>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
template void level_up_messages_gpu<float>(int dst_idx, int dst_cols, int dst_rows, int src_rows, DevMem2Db* mus, DevMem2Db* mds, DevMem2Db* mls, DevMem2Db* mrs, cudaStream_t stream);
///////////////////////////////////////////////////////////////
//////////////////// calc all iterations /////////////////////
///////////////////////////////////////////////////////////////

template <typename T>
__device__ void calc_min_linear_penalty(T* dst, size_t step)
{
    float prev = dst[0];
    float cur;
    for (int disp = 1; disp < cndisp; ++disp)
    {
        prev += cdisc_single_jump;
        cur = dst[step * disp];
        if (prev < cur)
        {
            cur = prev;
            dst[step * disp] = saturate_cast<T>(prev);
        }
        prev = cur;
    }

    prev = dst[(cndisp - 1) * step];
    for (int disp = cndisp - 2; disp >= 0; disp--)
    {
        prev += cdisc_single_jump;
        cur = dst[step * disp];
        if (prev < cur)
        {
            cur = prev;
            dst[step * disp] = saturate_cast<T>(prev);
        }
        prev = cur;
    }
}

template <typename T>
__device__ void message(const T* msg1, const T* msg2, const T* msg3, const T* data, T* dst, size_t msg_disp_step, size_t data_disp_step)
{
    float minimum = device::numeric_limits<float>::max();

    for(int i = 0; i < cndisp; ++i)
    {
        float dst_reg = msg1[msg_disp_step * i];
        dst_reg += msg2[msg_disp_step * i];
        dst_reg += msg3[msg_disp_step * i];
        dst_reg += data[data_disp_step * i];

        if (dst_reg < minimum)
            minimum = dst_reg;

        dst[msg_disp_step * i] = saturate_cast<T>(dst_reg);
    }

    calc_min_linear_penalty(dst, msg_disp_step);

    minimum += cmax_disc_term;

    float sum = 0;
    for(int i = 0; i < cndisp; ++i)
    {
        float dst_reg = dst[msg_disp_step * i];
        if (dst_reg > minimum)
        {
            dst_reg = minimum;
            dst[msg_disp_step * i] = saturate_cast<T>(minimum);
        }
        sum += dst_reg;
    }
    sum /= cndisp;

    for(int i = 0; i < cndisp; ++i)
        dst[msg_disp_step * i] -= sum;
}

template <typename T>
__global__ void one_iteration(int t, PtrElemStep_<T> u, T* d, T* l, T* r, const PtrElemStep_<T> data, int cols, int rows)
{
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);

    if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
    {
        T* us = u.ptr(y) + x;
        T* ds = d + y * u.step + x;
        T* ls = l + y * u.step + x;
        T* rs = r + y * u.step + x;
        const T* dt = data.ptr(y) + x;

        size_t msg_disp_step = u.step * rows;
        size_t data_disp_step = data.step * rows;

        message(us + u.step, ls + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);
        message(ds - u.step, ls + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);
        message(us + u.step, ds - u.step, rs - 1, dt, rs, msg_disp_step, data_disp_step);
        message(us + u.step, ds - u.step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
    }
}

template <typename T>
void calc_all_iterations_gpu(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d,
    const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);

    grid.x = divUp(cols, threads.x << 1);
    grid.y = divUp(rows, threads.y);

    for(int t = 0; t < iters; ++t)
    {
        one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

template void calc_all_iterations_gpu<short>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
template void calc_all_iterations_gpu<float>(int cols, int rows, int iters, const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, cudaStream_t stream);
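one_iteration sweeps a red-black checkerboard: x is derived as 2*thread + ((y + t) & 1), so each pass updates only pixels whose (x + y) parity matches the iteration parity, which is also why grid.x is divUp(cols, threads.x << 1). A tiny helper to see which pixels a pass touches (updatedThisPass is hypothetical, for illustration only):

// x = 2*k + ((y + t) & 1) implies x + y + t is always even for updated pixels.
inline bool updatedThisPass(int x, int y, int t)
{
    return ((x + y + t) & 1) == 0;
}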
///////////////////////////////////////////////////////////////
/////////////////////////// output ////////////////////////////
///////////////////////////////////////////////////////////////

template <typename T>
__global__ void output(const PtrElemStep_<T> u, const T* d, const T* l, const T* r, const T* data,
    DevMem2D_<short> disp)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
    {
        const T* us = u.ptr(y + 1) + x;
        const T* ds = d + (y - 1) * u.step + x;
        const T* ls = l + y * u.step + (x + 1);
        const T* rs = r + y * u.step + (x - 1);
        const T* dt = data + y * u.step + x;

        size_t disp_step = disp.rows * u.step;

        int best = 0;
        float best_val = numeric_limits<float>::max();
        for (int d = 0; d < cndisp; ++d)
        {
            float val = us[d * disp_step];
            val += ds[d * disp_step];
            val += ls[d * disp_step];
            val += rs[d * disp_step];
            val += dt[d * disp_step];

            if (val < best_val)
            {
                best_val = val;
                best = d;
            }
        }

        disp.ptr(y)[x] = saturate_cast<short>(best);
    }
}

template <typename T>
void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,
    const DevMem2D_<short>& disp, cudaStream_t stream)
{
    dim3 threads(32, 8, 1);
    dim3 grid(1, 1, 1);

    grid.x = divUp(disp.cols, threads.x);
    grid.y = divUp(disp.rows, threads.y);

    output<T><<<grid, threads, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}

template void output_gpu<short>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);
template void output_gpu<float>(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data, const DevMem2D_<short>& disp, cudaStream_t stream);

}}}
} // namespace stereobp

END_OPENCV_DEVICE_NAMESPACE
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -71,16 +71,20 @@ cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }

#include "opencv2/gpu/stream_accessor.hpp"

namespace cv { namespace gpu { namespace device {
    void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream = 0);

    template <typename T>
    void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);
    template <typename T>
    void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);

    void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
}}}
BEGIN_OPENCV_DEVICE_NAMESPACE

void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream = 0);

template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);

void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);

END_OPENCV_DEVICE_NAMESPACE

using namespace OPENCV_DEVICE_NAMESPACE;

struct Stream::Impl
{
@ -101,14 +105,14 @@ namespace
    void kernelSet(GpuMat& src, const Scalar& s, cudaStream_t stream)
    {
        Scalar_<T> sf = s;
        device::set_to_gpu(src, sf.val, src.channels(), stream);
        set_to_gpu(src, sf.val, src.channels(), stream);
    }

    template <typename T>
    void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream)
    {
        Scalar_<T> sf = s;
        device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
        set_to_gpu(src, sf.val, mask, src.channels(), stream);
    }
}

@ -255,7 +259,7 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype,
        psrc = &(temp = src);

    dst.create( src.size(), rtype );
    device::convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
    convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
}

cv::gpu::Stream::operator bool() const
@ -123,18 +123,19 @@ namespace
////////////////////////////////////////////////////////////////////////
// add

namespace cv { namespace gpu { namespace device
{
    template <typename T, typename D>
    void add_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);

    template <typename T, typename D>
    void add_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
}}}
BEGIN_OPENCV_DEVICE_NAMESPACE

template <typename T, typename D>
void add_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);

template <typename T, typename D>
void add_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
{
    using namespace cv::gpu::device;
    using namespace OPENCV_DEVICE_NAMESPACE;

    typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);

@ -173,7 +174,7 @@ void cv::gpu::add(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Gpu

void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
{
    using namespace cv::gpu::device;
    using namespace OPENCV_DEVICE_NAMESPACE;

    typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
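Each of these host wrappers picks its kernel instantiation through a func_t table indexed by matrix depth. A miniature of the pattern, with a made-up two-entry table (the real tables cover every supported depth pair):

// Hypothetical miniature of the dispatch used by cv::gpu::add and friends.
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);

static const func_t tab[] =
{
    add_gpu<unsigned char, unsigned char>, // CV_8U  -> CV_8U  (illustrative)
    add_gpu<float, float>,                 // CV_32F -> CV_32F (illustrative)
};

// tab[i](src1, src2, dst, mask, stream);  // i chosen from the src/dst depths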
@ -235,18 +236,19 @@ void cv::gpu::add(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat
////////////////////////////////////////////////////////////////////////
// subtract

namespace cv { namespace gpu { namespace device
{
    template <typename T, typename D>
    void subtract_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);

    template <typename T, typename D>
    void subtract_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
}}}
BEGIN_OPENCV_DEVICE_NAMESPACE

template <typename T, typename D>
void subtract_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);

template <typename T, typename D>
void subtract_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
{
    using namespace cv::gpu::device;
    using namespace OPENCV_DEVICE_NAMESPACE;

    typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);

@ -285,7 +287,7 @@ void cv::gpu::subtract(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cons

void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const GpuMat& mask, int dtype, Stream& s)
{
    using namespace cv::gpu::device;
    using namespace OPENCV_DEVICE_NAMESPACE;

    typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, const PtrStepb& mask, cudaStream_t stream);
@ -347,21 +349,22 @@ void cv::gpu::subtract(const GpuMat& src, const Scalar& sc, GpuMat& dst, const G
////////////////////////////////////////////////////////////////////////
// multiply

namespace cv { namespace gpu { namespace device
{
    void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
    void multiply_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);

    template <typename T, typename D>
    void multiply_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);

    template <typename T, typename D>
    void multiply_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);
}}}
BEGIN_OPENCV_DEVICE_NAMESPACE

void multiply_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
void multiply_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);

template <typename T, typename D>
void multiply_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);

template <typename T, typename D>
void multiply_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)
{
    using namespace cv::gpu::device;
    using namespace OPENCV_DEVICE_NAMESPACE;

    typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);

@ -419,7 +422,7 @@ void cv::gpu::multiply(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, doub

void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)
{
    using namespace cv::gpu::device;
    using namespace OPENCV_DEVICE_NAMESPACE;

    typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);
@ -469,24 +472,25 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
////////////////////////////////////////////////////////////////////////
// divide

namespace cv { namespace gpu { namespace device
{
    void divide_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
    void divide_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);

    template <typename T, typename D>
    void divide_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);

    template <typename T, typename D>
    void divide_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);

    template <typename T, typename D>
    void divide_gpu(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
}}}
BEGIN_OPENCV_DEVICE_NAMESPACE

void divide_gpu(const DevMem2D_<uchar4>& src1, const DevMem2Df& src2, const DevMem2D_<uchar4>& dst, cudaStream_t stream);
void divide_gpu(const DevMem2D_<short4>& src1, const DevMem2Df& src2, const DevMem2D_<short4>& dst, cudaStream_t stream);

template <typename T, typename D>
void divide_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);

template <typename T, typename D>
void divide_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);

template <typename T, typename D>
void divide_gpu(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double scale, int dtype, Stream& s)
{
    using namespace cv::gpu::device;
    using namespace OPENCV_DEVICE_NAMESPACE;

    typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, double scale, cudaStream_t stream);

@ -544,7 +548,7 @@ void cv::gpu::divide(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, double

void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double scale, int dtype, Stream& s)
{
    using namespace cv::gpu::device;
    using namespace OPENCV_DEVICE_NAMESPACE;

    typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, double scale, cudaStream_t stream);

@ -593,7 +597,7 @@ void cv::gpu::divide(const GpuMat& src, const Scalar& sc, GpuMat& dst, double sc

void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, Stream& s)
{
    using namespace cv::gpu::device;
    using namespace OPENCV_DEVICE_NAMESPACE;

    typedef void (*func_t)(double scalar, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
@ -626,18 +630,19 @@ void cv::gpu::divide(double scale, const GpuMat& src, GpuMat& dst, int dtype, St
//////////////////////////////////////////////////////////////////////////////
// absdiff

namespace cv { namespace gpu { namespace device
{
    template <typename T>
    void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);

    template <typename T>
    void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);
}}}
BEGIN_OPENCV_DEVICE_NAMESPACE

template <typename T>
void absdiff_gpu(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);

template <typename T>
void absdiff_gpu(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& s)
{
    using namespace cv::gpu::device;
    using namespace OPENCV_DEVICE_NAMESPACE;

    typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);

@ -709,7 +714,7 @@ void cv::gpu::absdiff(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Strea

void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Stream& s)
{
    using namespace cv::gpu::device;
    using namespace OPENCV_DEVICE_NAMESPACE;

    typedef void (*func_t)(const DevMem2Db& src1, double val, const DevMem2Db& dst, cudaStream_t stream);
@ -753,17 +758,18 @@ void cv::gpu::absdiff(const GpuMat& src1, const Scalar& src2, GpuMat& dst, Strea
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// Comparison of two matrixes
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template <typename T> void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
template <typename T> void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
template <typename T> void compare_lt(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
template <typename T> void compare_le(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
}}}
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
template <typename T> void compare_eq(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
template <typename T> void compare_ne(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
template <typename T> void compare_lt(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
template <typename T> void compare_le(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int cmpop, Stream& stream)
|
||||
{
|
||||
using namespace cv::gpu::device;
|
||||
using namespace OPENCV_DEVICE_NAMESPACE;
|
||||
|
||||
typedef void (*func_t)(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||
|
||||
@ -829,13 +835,14 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
|
||||
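The body elided by the hunk above selects among the four kernels declared here based on cmpop. A hedged sketch of the dispatch idea (illustrative only: the instantiation depth and the operand swap for GT/GE are assumptions, not lines from this commit):

    // Hypothetical cmpop dispatch for one depth; src1/src2/dst are DevMem2Db views.
    switch (cmpop)
    {
    case CMP_EQ: compare_eq<int>(src1, src2, dst, stream); break;
    case CMP_NE: compare_ne<int>(src1, src2, dst, stream); break;
    case CMP_LT: compare_lt<int>(src1, src2, dst, stream); break;
    case CMP_LE: compare_le<int>(src1, src2, dst, stream); break;
    case CMP_GT: compare_lt<int>(src2, src1, dst, stream); break; // a > b  ==  b < a
    case CMP_GE: compare_le<int>(src2, src1, dst, stream); break; // a >= b ==  b <= a
    }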
//////////////////////////////////////////////////////////////////////////////
// Unary bitwise logical operations

namespace cv { namespace gpu { namespace device
{
void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src, PtrStepb dst, cudaStream_t stream);
BEGIN_OPENCV_DEVICE_NAMESPACE

template <typename T>
void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
}}}
void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src, PtrStepb dst, cudaStream_t stream);

template <typename T>
void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);

END_OPENCV_DEVICE_NAMESPACE

namespace
{
@ -843,20 +850,23 @@ namespace
{
dst.create(src.size(), src.type());

cv::gpu::device::bitwiseNotCaller(src.rows, src.cols, src.elemSize1(),
dst.channels(), src, dst, stream);
OPENCV_DEVICE_NAMESPACE_ bitwiseNotCaller(src.rows, src.cols, src.elemSize1(), dst.channels(), src, dst, stream);
}


void bitwiseNotCaller(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
{
using namespace cv::gpu;
using namespace OPENCV_DEVICE_NAMESPACE;

typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
static Caller callers[] = {device::bitwiseMaskNotCaller<unsigned char>, device::bitwiseMaskNotCaller<unsigned char>,
device::bitwiseMaskNotCaller<unsigned short>, device::bitwiseMaskNotCaller<unsigned short>,
device::bitwiseMaskNotCaller<unsigned int>, device::bitwiseMaskNotCaller<unsigned int>,
device::bitwiseMaskNotCaller<unsigned int>};

static Caller callers[] =
{
bitwiseMaskNotCaller<unsigned char>, bitwiseMaskNotCaller<unsigned char>,
bitwiseMaskNotCaller<unsigned short>, bitwiseMaskNotCaller<unsigned short>,
bitwiseMaskNotCaller<unsigned int>, bitwiseMaskNotCaller<unsigned int>,
bitwiseMaskNotCaller<unsigned int>
};

CV_Assert(mask.type() == CV_8U && mask.size() == src.size());
dst.create(src.size(), src.type());

@ -874,33 +884,33 @@ namespace

void cv::gpu::bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask, Stream& stream)
{
if (mask.empty())
::bitwiseNotCaller(src, dst, StreamAccessor::getStream(stream));
bitwiseNotCaller(src, dst, StreamAccessor::getStream(stream));
else
::bitwiseNotCaller(src, dst, mask, StreamAccessor::getStream(stream));
bitwiseNotCaller(src, dst, mask, StreamAccessor::getStream(stream));
}
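A hedged usage sketch of the two entry paths above (d_src and d_mask are hypothetical; the default arguments are assumed from the era's gpu API):

    cv::gpu::GpuMat d_src(240, 320, CV_8UC1), d_dst;
    cv::gpu::GpuMat d_mask(240, 320, CV_8UC1, cv::Scalar(255));
    cv::gpu::bitwise_not(d_src, d_dst);         // empty mask: element-size based caller
    cv::gpu::bitwise_not(d_src, d_dst, d_mask); // CV_8U mask: per-depth template caller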
//////////////////////////////////////////////////////////////////////////////
// Binary bitwise logical operations

namespace cv { namespace gpu { namespace device
{
void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
BEGIN_OPENCV_DEVICE_NAMESPACE

template <typename T>
void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);

void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
template <typename T>
void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);

template <typename T>
void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);

void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);
template <typename T>
void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);

template <typename T>
void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);
}}}
void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1, const PtrStepb src2, PtrStepb dst, cudaStream_t stream);

template <typename T>
void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2, const PtrStepb mask, PtrStepb dst, cudaStream_t stream);

END_OPENCV_DEVICE_NAMESPACE

namespace
{
@ -909,20 +919,22 @@ namespace
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());

cv::gpu::device::bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(),
dst.channels(), src1, src2, dst, stream);
OPENCV_DEVICE_NAMESPACE_ bitwiseOrCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
}


void bitwiseOrCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
{
using namespace cv::gpu;
using namespace OPENCV_DEVICE_NAMESPACE;

typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
static Caller callers[] = {device::bitwiseMaskOrCaller<unsigned char>, device::bitwiseMaskOrCaller<unsigned char>,
device::bitwiseMaskOrCaller<unsigned short>, device::bitwiseMaskOrCaller<unsigned short>,
device::bitwiseMaskOrCaller<unsigned int>, device::bitwiseMaskOrCaller<unsigned int>,
device::bitwiseMaskOrCaller<unsigned int>};

static Caller callers[] =
{
bitwiseMaskOrCaller<unsigned char>, bitwiseMaskOrCaller<unsigned char>,
bitwiseMaskOrCaller<unsigned short>, bitwiseMaskOrCaller<unsigned short>,
bitwiseMaskOrCaller<unsigned int>, bitwiseMaskOrCaller<unsigned int>,
bitwiseMaskOrCaller<unsigned int>
};

CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());

@ -940,20 +952,23 @@ namespace
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());

cv::gpu::device::bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(),
dst.channels(), src1, src2, dst, stream);
OPENCV_DEVICE_NAMESPACE_ bitwiseAndCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
}


void bitwiseAndCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
{
using namespace cv::gpu;
using namespace OPENCV_DEVICE_NAMESPACE;

typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
static Caller callers[] = {device::bitwiseMaskAndCaller<unsigned char>, device::bitwiseMaskAndCaller<unsigned char>,
device::bitwiseMaskAndCaller<unsigned short>, device::bitwiseMaskAndCaller<unsigned short>,
device::bitwiseMaskAndCaller<unsigned int>, device::bitwiseMaskAndCaller<unsigned int>,
device::bitwiseMaskAndCaller<unsigned int>};

static Caller callers[] =
{
bitwiseMaskAndCaller<unsigned char>, bitwiseMaskAndCaller<unsigned char>,
bitwiseMaskAndCaller<unsigned short>, bitwiseMaskAndCaller<unsigned short>,
bitwiseMaskAndCaller<unsigned int>, bitwiseMaskAndCaller<unsigned int>,
bitwiseMaskAndCaller<unsigned int>
};

CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());

@ -971,20 +986,23 @@ namespace
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());

cv::gpu::device::bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(),
dst.channels(), src1, src2, dst, stream);
OPENCV_DEVICE_NAMESPACE_ bitwiseXorCaller(dst.rows, dst.cols, dst.elemSize1(), dst.channels(), src1, src2, dst, stream);
}


void bitwiseXorCaller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
{
using namespace cv::gpu;
using namespace OPENCV_DEVICE_NAMESPACE;

typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
static Caller callers[] = {device::bitwiseMaskXorCaller<unsigned char>, device::bitwiseMaskXorCaller<unsigned char>,
device::bitwiseMaskXorCaller<unsigned short>, device::bitwiseMaskXorCaller<unsigned short>,
device::bitwiseMaskXorCaller<unsigned int>, device::bitwiseMaskXorCaller<unsigned int>,
device::bitwiseMaskXorCaller<unsigned int>};

static Caller callers[] =
{
bitwiseMaskXorCaller<unsigned char>, bitwiseMaskXorCaller<unsigned char>,
bitwiseMaskXorCaller<unsigned short>, bitwiseMaskXorCaller<unsigned short>,
bitwiseMaskXorCaller<unsigned int>, bitwiseMaskXorCaller<unsigned int>,
bitwiseMaskXorCaller<unsigned int>
};

CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());

@ -1001,47 +1019,48 @@ namespace

void cv::gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
{
if (mask.empty())
::bitwiseOrCaller(src1, src2, dst, StreamAccessor::getStream(stream));
bitwiseOrCaller(src1, src2, dst, StreamAccessor::getStream(stream));
else
::bitwiseOrCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
bitwiseOrCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
}


void cv::gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
{
if (mask.empty())
::bitwiseAndCaller(src1, src2, dst, StreamAccessor::getStream(stream));
bitwiseAndCaller(src1, src2, dst, StreamAccessor::getStream(stream));
else
::bitwiseAndCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
bitwiseAndCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
}


void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, Stream& stream)
{
if (mask.empty())
::bitwiseXorCaller(src1, src2, dst, StreamAccessor::getStream(stream));
bitwiseXorCaller(src1, src2, dst, StreamAccessor::getStream(stream));
else
::bitwiseXorCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
bitwiseXorCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
}
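Each seven-entry Caller array above is indexed by the source depth. A hedged reading (an inference from the duplicated entries, not text from the commit): bitwise kernels only depend on element width, so CV_8U/CV_8S share the unsigned char instantiation, CV_16U/CV_16S the unsigned short one, and the 32-bit depths the unsigned int one.

    // Illustrative dispatch; callers and src1 as in the masked callers above.
    typedef void (*Caller)(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
    Caller caller = callers[src1.depth()]; // e.g. CV_32FC1 -> bitwiseMaskOrCaller<unsigned int>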
//////////////////////////////////////////////////////////////////////////////
// Minimum and maximum operations

namespace cv { namespace gpu { namespace device
{
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
BEGIN_OPENCV_DEVICE_NAMESPACE

template <typename T>
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);

template <typename T>
void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);

template <typename T>
void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);
}}}
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);

template <typename T>
void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);

END_OPENCV_DEVICE_NAMESPACE

namespace
{
@ -1050,14 +1069,14 @@ namespace
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
device::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
OPENCV_DEVICE_NAMESPACE_ min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
}

template <typename T>
void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
{
dst.create(src1.size(), src1.type());
device::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
OPENCV_DEVICE_NAMESPACE_ min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
}

template <typename T>
@ -1065,14 +1084,14 @@ namespace
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
dst.create(src1.size(), src1.type());
device::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
OPENCV_DEVICE_NAMESPACE_ max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
}

template <typename T>
void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
{
dst.create(src1.size(), src1.type());
device::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
OPENCV_DEVICE_NAMESPACE_ max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
}
}

@ -1136,18 +1155,18 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream)
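min_caller and max_caller above flatten their operands with reshape(1) before launching the per-type kernel: an element-wise min or max is channel-agnostic, so an n-channel matrix can be processed as a single-channel one with n times the columns. A hedged illustration:

    cv::gpu::GpuMat rgb(480, 640, CV_8UC3);
    cv::gpu::GpuMat flat = rgb.reshape(1); // 480 x 1920, CV_8UC1, same device memory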
////////////////////////////////////////////////////////////////////////
// threshold

namespace cv { namespace gpu { namespace device
{
template <typename T>
void threshold_gpu(const DevMem2Db& src, const DevMem2Db& dst, T thresh, T maxVal, int type,
cudaStream_t stream);
}}}
BEGIN_OPENCV_DEVICE_NAMESPACE

template <typename T>
void threshold_gpu(const DevMem2Db& src, const DevMem2Db& dst, T thresh, T maxVal, int type, cudaStream_t stream);

END_OPENCV_DEVICE_NAMESPACE

namespace
{
template <typename T> void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream)
{
device::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
OPENCV_DEVICE_NAMESPACE_ threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
}
}

@ -1204,24 +1223,27 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
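A hedged usage sketch of the public wrapper that reaches threshold_caller above (image and parameters are illustrative):

    cv::gpu::GpuMat d_src(480, 640, CV_32FC1), d_dst;
    // thresh and maxVal pass through saturate_cast<float> before the kernel launch
    cv::gpu::threshold(d_src, d_dst, 0.5, 1.0, cv::THRESH_BINARY);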
////////////////////////////////////////////////////////////////////////
// pow

namespace cv { namespace gpu { namespace device
{
template<typename T>
void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
}}}
BEGIN_OPENCV_DEVICE_NAMESPACE

template<typename T>
void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
{
CV_Assert( src.depth() != CV_64F );
using namespace OPENCV_DEVICE_NAMESPACE;

CV_Assert(src.depth() != CV_64F);
dst.create(src.size(), src.type());

typedef void (*caller_t)(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);

static const caller_t callers[] =
{
device::pow_caller<unsigned char>, device::pow_caller<signed char>,
device::pow_caller<unsigned short>, device::pow_caller<short>,
device::pow_caller<int>, device::pow_caller<float>
pow_caller<unsigned char>, pow_caller<signed char>,
pow_caller<unsigned short>, pow_caller<short>,
pow_caller<int>, pow_caller<float>
};

callers[src.depth()](src.reshape(1), (float)power, dst.reshape(1), StreamAccessor::getStream(stream));
@ -1230,14 +1252,17 @@ void cv::gpu::pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream)
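A hedged usage sketch of cv::gpu::pow above. The CV_Assert rejects CV_64F inputs, which is consistent with the callers array having no 64F entry:

    cv::gpu::GpuMat d_img(480, 640, CV_32FC1), d_out;
    cv::gpu::pow(d_img, 2.0, d_out); // callers[CV_32F] -> pow_caller<float>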
////////////////////////////////////////////////////////////////////////
// addWeighted

namespace cv { namespace gpu { namespace device
{
template <typename T1, typename T2, typename D>
void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);
}}}
BEGIN_OPENCV_DEVICE_NAMESPACE

template <typename T1, typename T2, typename D>
void addWeighted_gpu(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2, double beta, double gamma, GpuMat& dst, int dtype, Stream& stream)
{
using namespace OPENCV_DEVICE_NAMESPACE;

CV_Assert(src1.size() == src2.size());
CV_Assert(src1.type() == src2.type() || (dtype >= 0 && src1.channels() == src2.channels()));

@ -1256,8 +1281,6 @@ void cv::gpu::addWeighted(const GpuMat& src1, double alpha, const GpuMat& src2,

typedef void (*caller_t)(const DevMem2Db& src1, double alpha, const DevMem2Db& src2, double beta, double gamma, const DevMem2Db& dst, cudaStream_t stream);

using namespace cv::gpu::device;

static const caller_t callers[7][7][7] =
{
{
@ -735,14 +735,21 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
////////////////////////////////////////////////////////////////////////////////////////////////////
// Separable Linear Filter

namespace cv { namespace gpu { namespace filters
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace row_filter
{
template <typename T, typename D>
void linearRowFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}

namespace column_filter
{
template <typename T, typename D>
void linearColumnFilter_gpu(const DevMem2Db& src, const DevMem2Db& dst, const float kernel[], int ksize, int anchor, int brd_type, cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

namespace
{
@ -796,6 +803,8 @@ namespace

Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor, int borderType)
{
using namespace OPENCV_DEVICE_NAMESPACE_ row_filter;

static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R};

if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4))
@ -837,25 +846,25 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
switch (srcType)
{
case CV_8UC1:
func = filters::linearRowFilter_gpu<uchar, float>;
func = linearRowFilter_gpu<uchar, float>;
break;
case CV_8UC4:
func = filters::linearRowFilter_gpu<uchar4, float4>;
func = linearRowFilter_gpu<uchar4, float4>;
break;
/*case CV_16SC1:
func = filters::linearRowFilter_gpu<short, float>;
func = linearRowFilter_gpu<short, float>;
break;*/
/*case CV_16SC2:
func = filters::linearRowFilter_gpu<short2, float2>;
func = linearRowFilter_gpu<short2, float2>;
break;*/
case CV_16SC3:
func = filters::linearRowFilter_gpu<short3, float3>;
func = linearRowFilter_gpu<short3, float3>;
break;
case CV_32SC1:
func = filters::linearRowFilter_gpu<int, float>;
func = linearRowFilter_gpu<int, float>;
break;
case CV_32FC1:
func = filters::linearRowFilter_gpu<float, float>;
func = linearRowFilter_gpu<float, float>;
break;
}

@ -909,6 +918,8 @@ namespace

Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor, int borderType)
{
using namespace OPENCV_DEVICE_NAMESPACE_ column_filter;

static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R};

if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4))
@ -950,25 +961,25 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
switch (dstType)
{
case CV_8UC1:
func = filters::linearColumnFilter_gpu<float, uchar>;
func = linearColumnFilter_gpu<float, uchar>;
break;
case CV_8UC4:
func = filters::linearColumnFilter_gpu<float4, uchar4>;
func = linearColumnFilter_gpu<float4, uchar4>;
break;
/*case CV_16SC1:
func = filters::linearColumnFilter_gpu<float, short>;
func = linearColumnFilter_gpu<float, short>;
break;*/
/*case CV_16SC2:
func = filters::linearColumnFilter_gpu<float2, short2>;
func = linearColumnFilter_gpu<float2, short2>;
break;*/
case CV_16SC3:
func = filters::linearColumnFilter_gpu<float3, short3>;
func = linearColumnFilter_gpu<float3, short3>;
break;
case CV_32SC1:
func = filters::linearColumnFilter_gpu<float, int>;
func = linearColumnFilter_gpu<float, int>;
break;
case CV_32FC1:
func = filters::linearColumnFilter_gpu<float, float>;
func = linearColumnFilter_gpu<float, float>;
break;
}
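A hedged usage sketch of the two factories above (the kernel, anchor and border constant are illustrative assumptions, not lines from this commit):

    cv::Mat k = cv::getGaussianKernel(7, 1.5, CV_32F);
    cv::Ptr<cv::gpu::BaseRowFilter_GPU> rowF =
        cv::gpu::getLinearRowFilter_GPU(CV_8UC1, CV_32FC1, k, -1, cv::BORDER_REFLECT101);
    cv::Ptr<cv::gpu::BaseColumnFilter_GPU> colF =
        cv::gpu::getLinearColumnFilter_GPU(CV_32FC1, CV_8UC1, k, -1, cv::BORDER_REFLECT101);
    // srcType CV_8UC1 with a CV_32FC1 buffer selects linearRowFilter_gpu<uchar, float>;
    // the NPP fast path applies only when bufType equals srcType (CV_8UC1 or CV_8UC4).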
@ -1,863 +0,0 @@
#include "precomp.hpp"

using namespace cv;
using namespace cv::gpu;
using namespace std;

cv::gpu::GpuMat::GpuMat(const GpuMat& m) :
flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend)
{
if (refcount)
CV_XADD(refcount, 1);
}

cv::gpu::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) :
flags(Mat::MAGIC_VAL + (type_ & TYPE_MASK)), rows(rows_), cols(cols_),
step(step_), data((uchar*)data_), refcount(0),
datastart((uchar*)data_), dataend((uchar*)data_)
{
size_t minstep = cols * elemSize();

if (step == Mat::AUTO_STEP)
{
step = minstep;
flags |= Mat::CONTINUOUS_FLAG;
}
else
{
if (rows == 1)
step = minstep;

CV_DbgAssert(step >= minstep);

flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
}
dataend += step * (rows - 1) + minstep;
}

cv::gpu::GpuMat::GpuMat(Size size_, int type_, void* data_, size_t step_) :
flags(Mat::MAGIC_VAL + (type_ & TYPE_MASK)), rows(size_.height), cols(size_.width),
step(step_), data((uchar*)data_), refcount(0),
datastart((uchar*)data_), dataend((uchar*)data_)
{
size_t minstep = cols * elemSize();

if (step == Mat::AUTO_STEP)
{
step = minstep;
flags |= Mat::CONTINUOUS_FLAG;
}
else
{
if (rows == 1)
step = minstep;

CV_DbgAssert(step >= minstep);

flags |= step == minstep ? Mat::CONTINUOUS_FLAG : 0;
}
dataend += step * (rows - 1) + minstep;
}

cv::gpu::GpuMat::GpuMat(const GpuMat& m, const Range& rowRange, const Range& colRange)
{
flags = m.flags;
step = m.step; refcount = m.refcount;
data = m.data; datastart = m.datastart; dataend = m.dataend;

if (rowRange == Range::all())
rows = m.rows;
else
{
CV_Assert(0 <= rowRange.start && rowRange.start <= rowRange.end && rowRange.end <= m.rows);

rows = rowRange.size();
data += step*rowRange.start;
}

if (colRange == Range::all())
cols = m.cols;
else
{
CV_Assert(0 <= colRange.start && colRange.start <= colRange.end && colRange.end <= m.cols);

cols = colRange.size();
data += colRange.start*elemSize();
flags &= cols < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
}

if (rows == 1)
flags |= Mat::CONTINUOUS_FLAG;

if (refcount)
CV_XADD(refcount, 1);

if (rows <= 0 || cols <= 0)
rows = cols = 0;
}

cv::gpu::GpuMat::GpuMat(const GpuMat& m, const Rect& roi) :
flags(m.flags), rows(roi.height), cols(roi.width),
step(m.step), data(m.data + roi.y*step), refcount(m.refcount),
datastart(m.datastart), dataend(m.dataend)
{
flags &= roi.width < m.cols ? ~Mat::CONTINUOUS_FLAG : -1;
data += roi.x * elemSize();

CV_Assert(0 <= roi.x && 0 <= roi.width && roi.x + roi.width <= m.cols && 0 <= roi.y && 0 <= roi.height && roi.y + roi.height <= m.rows);

if (refcount)
CV_XADD(refcount, 1);

if (rows <= 0 || cols <= 0)
rows = cols = 0;
}

cv::gpu::GpuMat::GpuMat(const Mat& m) :
flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0)
{
upload(m);
}

GpuMat& cv::gpu::GpuMat::operator = (const GpuMat& m)
{
if (this != &m)
{
GpuMat temp(m);
swap(temp);
}

return *this;
}

void cv::gpu::GpuMat::swap(GpuMat& b)
{
std::swap(flags, b.flags);
std::swap(rows, b.rows);
std::swap(cols, b.cols);
std::swap(step, b.step);
std::swap(data, b.data);
std::swap(datastart, b.datastart);
std::swap(dataend, b.dataend);
std::swap(refcount, b.refcount);
}

void cv::gpu::GpuMat::locateROI(Size& wholeSize, Point& ofs) const
{
size_t esz = elemSize();
ptrdiff_t delta1 = data - datastart;
ptrdiff_t delta2 = dataend - datastart;

CV_DbgAssert(step > 0);

if (delta1 == 0)
ofs.x = ofs.y = 0;
else
{
ofs.y = static_cast<int>(delta1 / step);
ofs.x = static_cast<int>((delta1 - step * ofs.y) / esz);

CV_DbgAssert(data == datastart + ofs.y * step + ofs.x * esz);
}

size_t minstep = (ofs.x + cols) * esz;

wholeSize.height = std::max(static_cast<int>((delta2 - minstep) / step + 1), ofs.y + rows);
wholeSize.width = std::max(static_cast<int>((delta2 - step * (wholeSize.height - 1)) / esz), ofs.x + cols);
}

GpuMat& cv::gpu::GpuMat::adjustROI(int dtop, int dbottom, int dleft, int dright)
{
Size wholeSize;
Point ofs;
locateROI(wholeSize, ofs);

size_t esz = elemSize();

int row1 = std::max(ofs.y - dtop, 0);
int row2 = std::min(ofs.y + rows + dbottom, wholeSize.height);

int col1 = std::max(ofs.x - dleft, 0);
int col2 = std::min(ofs.x + cols + dright, wholeSize.width);

data += (row1 - ofs.y) * step + (col1 - ofs.x) * esz;
rows = row2 - row1;
cols = col2 - col1;

if (esz * cols == step || rows == 1)
flags |= Mat::CONTINUOUS_FLAG;
else
flags &= ~Mat::CONTINUOUS_FLAG;

return *this;
}

GpuMat cv::gpu::GpuMat::reshape(int new_cn, int new_rows) const
{
GpuMat hdr = *this;

int cn = channels();
if (new_cn == 0)
new_cn = cn;

int total_width = cols * cn;

if ((new_cn > total_width || total_width % new_cn != 0) && new_rows == 0)
new_rows = rows * total_width / new_cn;

if (new_rows != 0 && new_rows != rows)
{
int total_size = total_width * rows;

if (!isContinuous())
CV_Error(CV_BadStep, "The matrix is not continuous, thus its number of rows can not be changed");

if ((unsigned)new_rows > (unsigned)total_size)
CV_Error(CV_StsOutOfRange, "Bad new number of rows");

total_width = total_size / new_rows;

if (total_width * new_rows != total_size)
CV_Error(CV_StsBadArg, "The total number of matrix elements is not divisible by the new number of rows");

hdr.rows = new_rows;
hdr.step = total_width * elemSize1();
}

int new_width = total_width / new_cn;

if (new_width * new_cn != total_width)
CV_Error(CV_BadNumChannels, "The total width is not divisible by the new number of channels");

hdr.cols = new_width;
hdr.flags = (hdr.flags & ~CV_MAT_CN_MASK) | ((new_cn - 1) << CV_CN_SHIFT);

return hdr;
}
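reshape() above is header-only: it rewrites cols, step and the channel bits of flags without touching device memory. A hedged illustration of the arithmetic:

    cv::gpu::GpuMat m(100, 64, CV_8UC4); // total_width = 64 * 4 = 256
    cv::gpu::GpuMat a = m.reshape(1);    // 100 x 256, CV_8UC1, same data pointer
    cv::gpu::GpuMat b = m.reshape(2);    // 100 x 128, CV_8UC2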
class GpuFuncTable
{
public:
virtual ~GpuFuncTable() {}

virtual void copy(const Mat& src, GpuMat& dst) const = 0;
virtual void copy(const GpuMat& src, Mat& dst) const = 0;
virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;

virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;

virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0;

virtual void setTo(GpuMat& m, const Scalar& s, const GpuMat& mask) const = 0;

virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
virtual void free(void* devPtr) const = 0;
};


#if !defined (HAVE_CUDA)

class EmptyFuncTable : public GpuFuncTable
{
public:
void copy(const Mat&, GpuMat&) const { throw_nogpu(); }
void copy(const GpuMat&, Mat&) const { throw_nogpu(); }
void copy(const GpuMat&, GpuMat&) const { throw_nogpu(); }

void copyWithMask(const GpuMat&, GpuMat&, const GpuMat&) const { throw_nogpu(); }

void convert(const GpuMat&, GpuMat&) const { throw_nogpu(); }
void convert(const GpuMat&, GpuMat&, double, double) const { throw_nogpu(); }

void setTo(GpuMat&, const Scalar&, const GpuMat&) const { throw_nogpu(); }

void mallocPitch(void**, size_t*, size_t, size_t) const { throw_nogpu(); }
void free(void*) const {}
};

const GpuFuncTable* gpuFuncTable()
{
static EmptyFuncTable empty;
return &empty;
}
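The table accessor compiles to exactly one of the two implementations, so the GpuMat methods further down never test HAVE_CUDA themselves. A hedged sketch of the call pattern (names from this file):

    const GpuFuncTable* tbl = gpuFuncTable(); // EmptyFuncTable here, CudaFuncTable below
    // Without CUDA every operation surfaces as throw_nogpu(); free(0) stays a
    // no-op so that release() remains safe on empty matrices.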
#else

namespace cv { namespace gpu { namespace device
{
void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t& stream = 0);

template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);

void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
}}}

namespace
{
//////////////////////////////////////////////////////////////////////////
// Convert

template<int n> struct NPPTypeTraits;
template<> struct NPPTypeTraits<CV_8U> { typedef Npp8u npp_type; };
template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };

template<int SDEPTH, int DDEPTH> struct NppConvertFunc
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
};
template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
{
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
};

template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

static void cvt(const GpuMat& src, GpuMat& dst)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );

cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
{
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

static void cvt(const GpuMat& src, GpuMat& dst)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );

cudaSafeCall( cudaDeviceSynchronize() );
}
};

void convertToKernelCaller(const GpuMat& src, GpuMat& dst)
{
device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
}
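NppCvt wraps one NPP primitive per (source depth, destination depth) pair, and convertToKernelCaller is the generic fallback that flattens channels and runs the templated device kernel. A hedged illustration of how a conversion resolves through the dispatch table in convert() below:

    cv::gpu::GpuMat d8(64, 64, CV_8UC1), d16(64, 64, CV_16UC1);
    // depth pair (CV_8U, CV_16U), one channel:
    //   -> NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt
    // the same pair with three channels falls back to convertToKernelCaller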
//////////////////////////////////////////////////////////////////////////
// Set

template<int SDEPTH, int SCN> struct NppSetFunc
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
};
template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
};

template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

static void set(GpuMat& src, const Scalar& s)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;

Scalar_<src_t> nppS = s;

nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );

cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

static void set(GpuMat& src, const Scalar& s)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;

Scalar_<src_t> nppS = s;

nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );

cudaSafeCall( cudaDeviceSynchronize() );
}
};

template <typename T>
void kernelSet(GpuMat& src, const Scalar& s)
{
Scalar_<T> sf = s;
device::set_to_gpu(src, sf.val, src.channels(), 0);
}
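kernelSet relies on Scalar_<T>'s converting assignment, which saturate_casts each double channel of the cv::Scalar to T. A hedged worked example (not part of the diff):

    cv::Scalar s(70000.0);
    cv::Scalar_<short> sf = s; // sf[0] == 32767: 70000 saturates at SHRT_MAX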
template<int SDEPTH, int SCN> struct NppSetMaskFunc
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
};
template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
};

template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

static void set(GpuMat& src, const Scalar& s, const GpuMat& mask)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;

Scalar_<src_t> nppS = s;

nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );

cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

static void set(GpuMat& src, const Scalar& s, const GpuMat& mask)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;

Scalar_<src_t> nppS = s;

nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );

cudaSafeCall( cudaDeviceSynchronize() );
}
};

template <typename T>
void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask)
{
Scalar_<T> sf = s;
device::set_to_gpu(src, sf.val, mask, src.channels(), 0);
}
}
class CudaFuncTable : public GpuFuncTable
{
public:
virtual void copy(const Mat& src, GpuMat& dst) const
{
cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) );
}
virtual void copy(const GpuMat& src, Mat& dst) const
{
cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) );
}
virtual void copy(const GpuMat& src, GpuMat& dst) const
{
cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) );
}

virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
{
device::copy_to_with_mask(src, dst, src.depth(), mask, src.channels());
}

void convert(const GpuMat& src, GpuMat& dst) const
{
typedef void (*caller_t)(const GpuMat& src, GpuMat& dst);
static const caller_t callers[7][7][7] =
{
{
/* 8U -> 8U */ {0, 0, 0, 0},
/* 8U -> 8S */ {convertToKernelCaller, convertToKernelCaller, convertToKernelCaller, convertToKernelCaller},
/* 8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::cvt},
/* 8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::cvt},
/* 8U -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8U -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 8S -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 8S */ {0,0,0,0},
/* 8S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 16U -> 8U */ {NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::cvt},
/* 16U -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16U -> 16U */ {0,0,0,0},
/* 16U -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16U -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 16S -> 8U */ {NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::cvt},
/* 16S -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16S -> 16S */ {0,0,0,0},
/* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 32S -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 32S */ {0,0,0,0},
/* 32S -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 32F -> 8U */ {NppCvt<CV_32F, CV_8U, nppiConvert_32f8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 32F */ {0,0,0,0},
/* 32F -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 64F -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 64F */ {0,0,0,0}
}
};

caller_t func = callers[src.depth()][dst.depth()][src.channels() - 1];
CV_DbgAssert(func != 0);

func(src, dst);
}
void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
{
device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta);
}

void setTo(GpuMat& m, const Scalar& s, const GpuMat& mask) const
{
NppiSize sz;
sz.width = m.cols;
sz.height = m.rows;

if (mask.empty())
{
if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
{
cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
return;
}

if (m.depth() == CV_8U)
{
int cn = m.channels();

if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
{
int val = saturate_cast<gpu::uchar>(s[0]);
cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
return;
}
}

typedef void (*caller_t)(GpuMat& src, const Scalar& s);
static const caller_t callers[7][4] =
{
{NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet<gpu::uchar>,kernelSet<gpu::uchar>,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
{kernelSet<gpu::schar>,kernelSet<gpu::schar>,kernelSet<gpu::schar>,kernelSet<gpu::schar>},
{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,NppSet<CV_16U, 2, nppiSet_16u_C2R>::set,kernelSet<gpu::ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,NppSet<CV_16S, 2, nppiSet_16s_C2R>::set,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet<int>,kernelSet<int>,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet<float>,kernelSet<float>,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
{kernelSet<double>,kernelSet<double>,kernelSet<double>,kernelSet<double>}
};

callers[m.depth()][m.channels() - 1](m, s);
}
else
{
typedef void (*caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask);

static const caller_t callers[7][4] =
{
{NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask<gpu::uchar>,kernelSetMask<gpu::uchar>,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
{kernelSetMask<gpu::schar>,kernelSetMask<gpu::schar>,kernelSetMask<gpu::schar>,kernelSetMask<gpu::schar>},
{NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask<gpu::ushort>,kernelSetMask<gpu::ushort>,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
{NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask<short>,kernelSetMask<short>,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
{NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask<int>,kernelSetMask<int>,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
{NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask<float>,kernelSetMask<float>,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
{kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>}
};

callers[m.depth()][m.channels() - 1](m, s, mask);
}
}
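Hedged examples of the three paths through setTo() above (the matrix is illustrative):

    cv::gpu::GpuMat img(480, 640, CV_8UC3);
    img.setTo(cv::Scalar::all(0));  // zero scalar: plain cudaMemset2D, no kernel
    img.setTo(cv::Scalar(7, 7, 7)); // CV_8U, equal channels: cudaMemset2D with 7
    img.setTo(cv::Scalar(1, 2, 3)); // general case: NPP/kernel dispatch table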
void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
{
cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
}

void free(void* devPtr) const
{
cudaFree(devPtr);
}
};

const GpuFuncTable* gpuFuncTable()
{
static CudaFuncTable cuda;
return &cuda;
}

#endif
void cv::gpu::GpuMat::upload(const Mat& m)
|
||||
{
|
||||
CV_DbgAssert(!m.empty());
|
||||
|
||||
create(m.size(), m.type());
|
||||
|
||||
gpuFuncTable()->copy(m, *this);
|
||||
}
|
||||
|
||||
void cv::gpu::GpuMat::download(cv::Mat& m) const
|
||||
{
|
||||
CV_DbgAssert(!empty());
|
||||
|
||||
m.create(size(), type());
|
||||
|
||||
gpuFuncTable()->copy(*this, m);
|
||||
}
|
||||
|
||||
void cv::gpu::GpuMat::copyTo(GpuMat& m) const
|
||||
{
|
||||
CV_DbgAssert(!empty());
|
||||
|
||||
m.create(size(), type());
|
||||
|
||||
gpuFuncTable()->copy(*this, m);
|
||||
}
|
||||
|
||||
void cv::gpu::GpuMat::copyTo(GpuMat& mat, const GpuMat& mask) const
|
||||
{
|
||||
if (mask.empty())
|
||||
copyTo(mat);
|
||||
else
|
||||
{
|
||||
mat.create(size(), type());
|
||||
|
||||
gpuFuncTable()->copyWithMask(*this, mat, mask);
|
||||
}
|
||||
}
|
||||
|
||||
void cv::gpu::GpuMat::convertTo(GpuMat& dst, int rtype, double alpha, double beta) const
{
    bool noScale = fabs(alpha - 1) < numeric_limits<double>::epsilon() && fabs(beta) < numeric_limits<double>::epsilon();

    if (rtype < 0)
        rtype = type();
    else
        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());

    int scn = channels();
    int sdepth = depth();
    int ddepth = CV_MAT_DEPTH(rtype);
    if (sdepth == ddepth && noScale)
    {
        copyTo(dst);
        return;
    }

    GpuMat temp;
    const GpuMat* psrc = this;
    if (sdepth != ddepth && psrc == &dst)
    {
        temp = *this;
        psrc = &temp;
    }

    dst.create(size(), rtype);

    if (noScale)
        gpuFuncTable()->convert(*psrc, dst);
    else
        gpuFuncTable()->convert(*psrc, dst, alpha, beta);
}
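Reviewer note: a short usage sketch of convertTo as implemented above (values chosen for illustration only):

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat src(64, 64, CV_8UC1, cv::Scalar(128));

        // Depth change with scaling: 8U -> 32F mapped into [0, 1].
        cv::gpu::GpuMat dst;
        src.convertTo(dst, CV_32F, 1.0 / 255.0);

        // alpha == 1, beta == 0, equal depths: degenerates to a plain copyTo().
        cv::gpu::GpuMat same;
        src.convertTo(same, -1);
        return 0;
    }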
GpuMat& cv::gpu::GpuMat::setTo(const Scalar& s, const GpuMat& mask)
{
    CV_Assert(mask.empty() || mask.type() == CV_8UC1);
    CV_DbgAssert(!empty());

    gpuFuncTable()->setTo(*this, s, mask);

    return *this;
}
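Reviewer note: a usage sketch for setTo, showing both the unmasked fast path and the masked path (all values illustrative):

    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::gpu::GpuMat img(256, 256, CV_8UC3);
        img.setTo(cv::Scalar::all(0));              // zero fill, served by the cudaMemset2D fast path

        cv::gpu::GpuMat mask(img.size(), CV_8UC1);
        mask.setTo(cv::Scalar(1));
        img.setTo(cv::Scalar(255, 0, 0), mask);     // masked fill, dispatched to an NPP or kernel caller
        return 0;
    }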
void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
{
    _type &= TYPE_MASK;

    if (rows == _rows && cols == _cols && type() == _type && data)
        return;

    if (data)
        release();

    CV_DbgAssert(_rows >= 0 && _cols >= 0);

    if (_rows > 0 && _cols > 0)
    {
        flags = Mat::MAGIC_VAL + _type;
        rows = _rows;
        cols = _cols;

        size_t esz = elemSize();

        void* devPtr;
        gpuFuncTable()->mallocPitch(&devPtr, &step, esz * cols, rows);

        // Single row must be continuous
        if (rows == 1)
            step = esz * cols;

        if (esz * cols == step)
            flags |= Mat::CONTINUOUS_FLAG;

        int64 _nettosize = static_cast<int64>(step) * rows;
        size_t nettosize = static_cast<size_t>(_nettosize);

        datastart = data = static_cast<uchar*>(devPtr);
        dataend = data + nettosize;

        refcount = static_cast<int*>(fastMalloc(sizeof(*refcount)));
        *refcount = 1;
    }
}

void cv::gpu::GpuMat::release()
{
    if (refcount && CV_XADD(refcount, -1) == 1)
    {
        fastFree(refcount);

        gpuFuncTable()->free(datastart);
    }

    data = datastart = dataend = 0;
    step = rows = cols = 0;
    refcount = 0;
}
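Reviewer note: because create() allocates through cudaMallocPitch, a row may be padded, so step (in bytes) can exceed cols * elemSize(). A sketch of how an element address is formed under that layout (the helper name is hypothetical; the pointer is a device address and must not be dereferenced on the host):

    #include <opencv2/gpu/gpu.hpp>

    // Address of element (y, x) in a pitched GpuMat: rows are 'step' bytes apart.
    unsigned char* elemPtr(const cv::gpu::GpuMat& m, int y, int x)
    {
        return m.data + y * m.step + x * m.elemSize();
    }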
@ -60,40 +60,44 @@ std::vector<float> cv::gpu::HOGDescriptor::getPeopleDetector64x128() { throw_nog

#else

namespace cv { namespace gpu { namespace hog {
BEGIN_OPENCV_DEVICE_NAMESPACE

void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
int nblocks_win_x, int nblocks_win_y);
namespace hog
{
void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
int nblocks_win_x, int nblocks_win_y);

void compute_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, const cv::gpu::DevMem2Df& grad,
const cv::gpu::DevMem2Db& qangle, float sigma, float* block_hists);
void compute_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, const cv::gpu::DevMem2Df& grad,
const cv::gpu::DevMem2Db& qangle, float sigma, float* block_hists);

void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, float* block_hists, float threshold);
void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, float* block_hists, float threshold);

void classify_hists(int win_height, int win_width, int block_stride_y,
int block_stride_x, int win_stride_y, int win_stride_x, int height,
int width, float* block_hists, float* coefs, float free_coef,
float threshold, unsigned char* labels);
void classify_hists(int win_height, int win_width, int block_stride_y,
int block_stride_x, int win_stride_y, int win_stride_x, int height,
int width, float* block_hists, float* coefs, float free_coef,
float threshold, unsigned char* labels);

void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
cv::gpu::DevMem2Df descriptors);
void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
cv::gpu::DevMem2Df descriptors);
void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
cv::gpu::DevMem2Df descriptors);
void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
cv::gpu::DevMem2Df descriptors);

void compute_gradients_8UC1(int nbins, int height, int width, const cv::gpu::DevMem2Db& img,
float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);
void compute_gradients_8UC4(int nbins, int height, int width, const cv::gpu::DevMem2Db& img,
float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);
void compute_gradients_8UC1(int nbins, int height, int width, const cv::gpu::DevMem2Db& img,
float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);
void compute_gradients_8UC4(int nbins, int height, int width, const cv::gpu::DevMem2Db& img,
float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2Db qangle, bool correct_gamma);

void resize_8UC1(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);
void resize_8UC4(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);
void resize_8UC1(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);
void resize_8UC4(const cv::gpu::DevMem2Db& src, cv::gpu::DevMem2Db dst);
}

}}}
END_OPENCV_DEVICE_NAMESPACE

using namespace OPENCV_DEVICE_NAMESPACE;

cv::gpu::HOGDescriptor::HOGDescriptor(Size win_size, Size block_size, Size block_stride, Size cell_size,
int nbins, double win_sigma, double threshold_L2hys, bool gamma_correction, int nlevels)
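Reviewer note: BEGIN_OPENCV_DEVICE_NAMESPACE, END_OPENCV_DEVICE_NAMESPACE and the OPENCV_DEVICE_NAMESPACE_ prefix are introduced by this commit but not defined in the hunks shown here. Judging by how they are used, plausible expansions are (an assumption, not the actual definitions):

    // Assumed expansions -- the real macros live elsewhere in the tree.
    #define BEGIN_OPENCV_DEVICE_NAMESPACE namespace cv { namespace gpu { namespace device {
    #define END_OPENCV_DEVICE_NAMESPACE   }}}
    #define OPENCV_DEVICE_NAMESPACE       ::cv::gpu::device
    #define OPENCV_DEVICE_NAMESPACE_      ::cv::gpu::device::

Under that reading, "using namespace OPENCV_DEVICE_NAMESPACE_ hog;" resolves to using namespace ::cv::gpu::device::hog;, which matches the unqualified call sites introduced below.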
@ -107,15 +107,20 @@ void cv::gpu::CannyBuf::release() { throw_nogpu(); }

////////////////////////////////////////////////////////////////////////
// remap

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace remap
{
template <typename T> void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst,
int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
}}}
template <typename T>
void remap_gpu(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst,
int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, int cc);
}

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, int interpolation, int borderMode, const Scalar& borderValue, Stream& stream)
{
using namespace cv::gpu::imgproc;
using namespace OPENCV_DEVICE_NAMESPACE_ remap;

typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2Db& dst, int interpolation,
int borderMode, const float* borderValue, cudaStream_t stream, int cc);
@ -155,13 +160,19 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp

////////////////////////////////////////////////////////////////////////
// meanShiftFiltering_GPU

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace imgproc
{
void meanShiftFiltering_gpu(const DevMem2Db& src, DevMem2Db dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria, Stream& stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;

if( src.empty() )
CV_Error( CV_StsBadArg, "The input image is empty" );
@ -180,19 +191,25 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
eps = 1.f;
eps = (float)std::max(criteria.epsilon, 0.0);

imgproc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));
meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));
}

////////////////////////////////////////////////////////////////////////
// meanShiftProc_GPU

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace imgproc
{
void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria, Stream& stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;

if( src.empty() )
CV_Error( CV_StsBadArg, "The input image is empty" );
@ -212,26 +229,32 @@ void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int
eps = 1.f;
eps = (float)std::max(criteria.epsilon, 0.0);

imgproc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));
meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps, StreamAccessor::getStream(stream));
}

////////////////////////////////////////////////////////////////////////
// drawColorDisp

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace imgproc
{
void drawColorDisp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);
void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

namespace
{
template <typename T>
void drawColorDisp_caller(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;

dst.create(src.size(), CV_8UC4);

imgproc::drawColorDisp_gpu((DevMem2D_<T>)src, dst, ndisp, stream);
drawColorDisp_gpu((DevMem2D_<T>)src, dst, ndisp, stream);
}

typedef void (*drawColorDisp_caller_t)(const GpuMat& src, GpuMat& dst, int ndisp, const cudaStream_t& stream);
@ -249,19 +272,26 @@ void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp, Stream& s

////////////////////////////////////////////////////////////////////////
// reprojectImageTo3D

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace imgproc
{
void reprojectImageTo3D_gpu(const DevMem2Db& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

namespace
{
template <typename T>
void reprojectImageTo3D_caller(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;

xyzw.create(disp.rows, disp.cols, CV_32FC4);
imgproc::reprojectImageTo3D_gpu((DevMem2D_<T>)disp, xyzw, Q.ptr<float>(), stream);

reprojectImageTo3D_gpu((DevMem2D_<T>)disp, xyzw, Q.ptr<float>(), stream);
}

typedef void (*reprojectImageTo3D_caller_t)(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream);
@ -279,10 +309,14 @@ void cv::gpu::reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat& Q,

////////////////////////////////////////////////////////////////////////
// resize

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace resize
{
template <typename T> void resize_gpu(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
{

@ -346,7 +380,7 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub
}
else
{
using namespace cv::gpu::imgproc;
using namespace OPENCV_DEVICE_NAMESPACE_ resize;

typedef void (*caller_t)(const DevMem2Db& src, float fx, float fy, const DevMem2Db& dst, int interpolation, cudaStream_t stream);
static const caller_t callers[6][4] =
@ -366,18 +400,24 @@ void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, doub

////////////////////////////////////////////////////////////////////////
// copyMakeBorder

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace copy_make_border
{
template <typename T, int cn> void copyMakeBorder_gpu(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderMode, const T* borderValue, cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

namespace
{
template <typename T, int cn> void copyMakeBorder_caller(const DevMem2Db& src, const DevMem2Db& dst, int top, int left, int borderType, const Scalar& value, cudaStream_t stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ copy_make_border;

Scalar_<T> val(saturate_cast<T>(value[0]), saturate_cast<T>(value[1]), saturate_cast<T>(value[2]), saturate_cast<T>(value[3]));

imgproc::copyMakeBorder_gpu<T, cn>(src, dst, top, left, borderType, val.val, stream);
copyMakeBorder_gpu<T, cn>(src, dst, top, left, borderType, val.val, stream);
}
}
@ -626,16 +666,22 @@ void cv::gpu::warpPerspective(const GpuMat& src, GpuMat& dst, const Mat& M, Size

//////////////////////////////////////////////////////////////////////////////
// buildWarpPlaneMaps

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace imgproc
{
void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
const float k_rinv[9], const float r_kinv[9], const float t[3], float scale,
cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, const Mat &T,
float scale, GpuMat& map_x, GpuMat& map_y, Stream& stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;

CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);
CV_Assert((T.size() == Size(3,1) || T.size() == Size(1,3)) && T.type() == CV_32F && T.isContinuous());
@ -647,23 +693,29 @@ void cv::gpu::buildWarpPlaneMaps(Size src_size, Rect dst_roi, const Mat &K, cons

map_x.create(dst_roi.size(), CV_32F);
map_y.create(dst_roi.size(), CV_32F);
imgproc::buildWarpPlaneMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(),
T.ptr<float>(), scale, StreamAccessor::getStream(stream));
buildWarpPlaneMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(),
T.ptr<float>(), scale, StreamAccessor::getStream(stream));
}

//////////////////////////////////////////////////////////////////////////////
// buildWarpCylindricalMaps

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace imgproc
{
void buildWarpCylindricalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
const float k_rinv[9], const float r_kinv[9], float scale,
cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
GpuMat& map_x, GpuMat& map_y, Stream& stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;

CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);
@ -674,24 +726,29 @@ void cv::gpu::buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat &K

map_x.create(dst_roi.size(), CV_32F);
map_y.create(dst_roi.size(), CV_32F);
imgproc::buildWarpCylindricalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(),
scale, StreamAccessor::getStream(stream));
buildWarpCylindricalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
}

//////////////////////////////////////////////////////////////////////////////
// buildWarpSphericalMaps

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace imgproc
{
void buildWarpSphericalMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
const float k_rinv[9], const float r_kinv[9], float scale,
cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
GpuMat& map_x, GpuMat& map_y, Stream& stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;

CV_Assert(K.size() == Size(3,3) && K.type() == CV_32F);
CV_Assert(R.size() == Size(3,3) && R.type() == CV_32F);
@ -702,8 +759,7 @@ void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K,

map_x.create(dst_roi.size(), CV_32F);
map_y.create(dst_roi.size(), CV_32F);
imgproc::buildWarpSphericalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(),
scale, StreamAccessor::getStream(stream));
buildWarpSphericalMaps(dst_roi.tl().x, dst_roi.tl().y, map_x, map_y, K_Rinv.ptr<float>(), R_Kinv.ptr<float>(), scale, StreamAccessor::getStream(stream));
}

////////////////////////////////////////////////////////////////////////
@ -843,17 +899,24 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum, Stream& s)

//////////////////////////////////////////////////////////////////////////////
// columnSum

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace imgproc
{
void columnSum_32F(const DevMem2Db src, const DevMem2Db dst);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::columnSum(const GpuMat& src, GpuMat& dst)
{
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;

CV_Assert(src.type() == CV_32F);

dst.create(src.size(), CV_32F);
imgproc::columnSum_32F(src, dst);

columnSum_32F(src, dst);
}

void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s)
@ -1140,7 +1203,6 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, S
histRange(src, hist, levels, buf, stream);
}

void cv::gpu::histRange(const GpuMat& src, GpuMat& hist, const GpuMat& levels, GpuMat& buf, Stream& stream)
{
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_16UC1 || src.type() == CV_16SC1 || src.type() == CV_32FC1);
@ -1183,13 +1245,19 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4
hist_callers[src.depth()](src, hist, levels, buf, StreamAccessor::getStream(stream));
}

namespace cv { namespace gpu { namespace histograms
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace hist
{
void histogram256_gpu(DevMem2Db src, int* hist, unsigned int* buf, cudaStream_t stream);

const int PARTIAL_HISTOGRAM256_COUNT = 240;
const int HISTOGRAM256_BIN_COUNT = 256;
}}}

void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream);
}

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)
{
@ -1199,7 +1267,7 @@ void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)

void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream)
{
using namespace cv::gpu::histograms;
using namespace OPENCV_DEVICE_NAMESPACE_ hist;

CV_Assert(src.type() == CV_8UC1);
@ -1223,14 +1291,9 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, Stream&
equalizeHist(src, dst, hist, buf, stream);
}

namespace cv { namespace gpu { namespace histograms
{
void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream);
}}}

void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat& buf, Stream& s)
{
using namespace cv::gpu::histograms;
using namespace OPENCV_DEVICE_NAMESPACE_ hist;

CV_Assert(src.type() == CV_8UC1);
@ -1264,13 +1327,16 @@ void cv::gpu::equalizeHist(const GpuMat& src, GpuMat& dst, GpuMat& hist, GpuMat&

////////////////////////////////////////////////////////////////////////
// cornerHarris & minEigenVal

namespace cv { namespace gpu { namespace imgproc {
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace imgproc
{
void extractCovData_caller(const DevMem2Df Dx, const DevMem2Df Dy, PtrStepf dst, cudaStream_t stream);
void cornerHarris_caller(const int block_size, const float k, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);
void cornerMinEigenVal_caller(const int block_size, const DevMem2Db Dx, const DevMem2Db Dy, DevMem2Db dst, int border_type, cudaStream_t stream);
}

}}}
END_OPENCV_DEVICE_NAMESPACE

namespace
{
@ -1316,7 +1382,6 @@ namespace

} // Anonymous namespace

bool cv::gpu::tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType)
{
switch (cpuBorderType)
@ -1356,6 +1421,8 @@ void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& D

void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, double k, int borderType, Stream& stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;

CV_Assert(borderType == cv::BORDER_REFLECT101 ||
borderType == cv::BORDER_REPLICATE);
@ -1364,7 +1431,7 @@ void cv::gpu::cornerHarris(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& D

extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);
dst.create(src.size(), CV_32F);
imgproc::cornerHarris_caller(blockSize, (float)k, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));
cornerHarris_caller(blockSize, (float)k, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));
}

void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType)
@ -1381,6 +1448,8 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM

void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuMat& Dy, GpuMat& buf, int blockSize, int ksize, int borderType, Stream& stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;

CV_Assert(borderType == cv::BORDER_REFLECT101 ||
borderType == cv::BORDER_REPLICATE);
@ -1389,24 +1458,30 @@ void cv::gpu::cornerMinEigenVal(const GpuMat& src, GpuMat& dst, GpuMat& Dx, GpuM

extractCovData(src, Dx, Dy, buf, blockSize, ksize, borderType, stream);
dst.create(src.size(), CV_32F);
imgproc::cornerMinEigenVal_caller(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));
cornerMinEigenVal_caller(blockSize, Dx, Dy, dst, gpuBorderType, StreamAccessor::getStream(stream));
}

//////////////////////////////////////////////////////////////////////////////
// mulSpectrums

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace imgproc
{
void mulSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);

void mulSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c, cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, bool conjB, Stream& stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;

typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, DevMem2D_<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { imgproc::mulSpectrums, imgproc::mulSpectrums_CONJ };

static Caller callers[] = { mulSpectrums, mulSpectrums_CONJ };

CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
CV_Assert(a.size() == b.size());
@ -1420,18 +1495,23 @@ void cv::gpu::mulSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flag

//////////////////////////////////////////////////////////////////////////////
// mulAndScaleSpectrums

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace imgproc
{
void mulAndScaleSpectrums(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);

void mulAndScaleSpectrums_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, float scale, DevMem2D_<cufftComplex> c, cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::mulAndScaleSpectrums(const GpuMat& a, const GpuMat& b, GpuMat& c, int flags, float scale, bool conjB, Stream& stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;

typedef void (*Caller)(const PtrStep<cufftComplex>, const PtrStep<cufftComplex>, float scale, DevMem2D_<cufftComplex>, cudaStream_t stream);
static Caller callers[] = { imgproc::mulAndScaleSpectrums, imgproc::mulAndScaleSpectrums_CONJ };
static Caller callers[] = { mulAndScaleSpectrums, mulAndScaleSpectrums_CONJ };

CV_Assert(a.type() == b.type() && a.type() == CV_32FC2);
CV_Assert(a.size() == b.size());
@ -1593,13 +1673,19 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
convolve(image, templ, result, ccorr, buf);
}

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace imgproc
{
void convolve_gpu(const DevMem2Df& src, const PtrStepf& dst, int kWidth, int kHeight, float* kernel, cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result, bool ccorr, ConvolveBuf& buf, Stream& stream)
{
using namespace OPENCV_DEVICE_NAMESPACE_ imgproc;

#ifndef HAVE_CUFFT

CV_Assert(image.type() == CV_32F);
@ -1622,7 +1708,7 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
templ.copyTo(contKernel);
}

imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));
convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));

#else
@ -1650,7 +1736,7 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
templ.copyTo(contKernel);
}

imgproc::convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));
convolve_gpu(image, result, templ.cols, templ.rows, contKernel.ptr<float>(), StreamAccessor::getStream(stream));
}
else
{
@ -1725,14 +1811,18 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,

//////////////////////////////////////////////////////////////////////////////
// pyrDown

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace pyr_down
{
template <typename T, int cn> void pyrDown_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream)
{
using namespace cv::gpu::imgproc;
using namespace OPENCV_DEVICE_NAMESPACE_ pyr_down;

typedef void (*func_t)(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
@ -1761,14 +1851,18 @@ void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, int borderType, Stream& st

//////////////////////////////////////////////////////////////////////////////
// pyrUp

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace pyr_up
{
template <typename T, int cn> void pyrUp_gpu(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream)
{
using namespace cv::gpu::imgproc;
using namespace OPENCV_DEVICE_NAMESPACE_ pyr_up;

typedef void (*func_t)(const DevMem2Db& src, const DevMem2Db& dst, int borderType, cudaStream_t stream);
@ -1839,7 +1933,9 @@ void cv::gpu::CannyBuf::release()
trackBuf2.release();
}

namespace cv { namespace gpu { namespace canny
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace canny
{
void calcSobelRowPass_gpu(PtrStepb src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols);
@ -1853,13 +1949,15 @@ namespace cv { namespace gpu { namespace canny
void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols);

void getEdges_gpu(PtrStepi map, PtrStepb dst, int rows, int cols);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

namespace
{
void CannyCaller(CannyBuf& buf, GpuMat& dst, float low_thresh, float high_thresh)
{
using namespace cv::gpu::canny;
using namespace OPENCV_DEVICE_NAMESPACE_ canny;

calcMap_gpu(buf.dx, buf.dy, buf.edgeBuf, buf.edgeBuf, dst.rows, dst.cols, low_thresh, high_thresh);
@ -1879,7 +1977,7 @@ void cv::gpu::Canny(const GpuMat& src, GpuMat& dst, double low_thresh, double hi

void cv::gpu::Canny(const GpuMat& src, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, int apperture_size, bool L2gradient)
{
using namespace cv::gpu::canny;
using namespace OPENCV_DEVICE_NAMESPACE_ canny;

CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
CV_Assert(src.type() == CV_8UC1);
@ -1918,7 +2016,7 @@ void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, GpuMat& dst, double low_

void cv::gpu::Canny(const GpuMat& dx, const GpuMat& dy, CannyBuf& buf, GpuMat& dst, double low_thresh, double high_thresh, bool L2gradient)
{
using namespace cv::gpu::canny;
using namespace OPENCV_DEVICE_NAMESPACE_ canny;

CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
CV_Assert(dx.type() == CV_32SC1 && dy.type() == CV_32SC1 && dx.size() == dy.size());
@ -271,5 +271,380 @@ void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory)
    setDevice(prev_device_id);
}

////////////////////////////////////////////////////////////////////
// GpuFuncTable

BEGIN_OPENCV_DEVICE_NAMESPACE

void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t& stream = 0);

template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);

void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);

END_OPENCV_DEVICE_NAMESPACE
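Reviewer note: set_to_gpu and convert_gpu are implemented in the module's .cu files, which this diff does not show. As a rough illustration only (names and layout are assumptions, not the actual OpenCV kernels), an element-wise fill over pitched 2-D memory looks like:

    // Illustrative only -- not the OpenCV kernel. Single-channel case.
    template <typename T>
    __global__ void setToKernel(unsigned char* data, size_t step, int rows, int cols, T value)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < cols && y < rows)
        {
            // Rows are 'step' bytes apart because the buffer comes from cudaMallocPitch.
            T* row = (T*)(data + y * step);
            row[x] = value;
        }
    }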
namespace
{
    //////////////////////////////////////////////////////////////////////////
    // Convert

    template<int n> struct NPPTypeTraits;
    template<> struct NPPTypeTraits<CV_8U> { typedef Npp8u npp_type; };
    template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
    template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
    template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
    template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };

    template<int SDEPTH, int DDEPTH> struct NppConvertFunc
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
    };
    template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
    {
        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
    };

    template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

        static void cvt(const GpuMat& src, GpuMat& dst)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;
            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
    template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
    {
        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;

        static void cvt(const GpuMat& src, GpuMat& dst)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;
            nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    void convertToKernelCaller(const GpuMat& src, GpuMat& dst)
    {
        OPENCV_DEVICE_NAMESPACE_ convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
    }
    //////////////////////////////////////////////////////////////////////////
    // Set

    template<int SDEPTH, int SCN> struct NppSetFunc
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
    };
    template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
    };

    template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        static void set(GpuMat& src, Scalar s)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            Scalar_<src_t> nppS = s;

            nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
    template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        static void set(GpuMat& src, Scalar s)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            Scalar_<src_t> nppS = s;

            nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    template <typename T>
    void kernelSet(GpuMat& src, Scalar s)
    {
        Scalar_<T> sf = s;
        OPENCV_DEVICE_NAMESPACE_ set_to_gpu(src, sf.val, src.channels(), 0);
    }
    template<int SDEPTH, int SCN> struct NppSetMaskFunc
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
    };
    template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
    };

    template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        static void set(GpuMat& src, Scalar s, const GpuMat& mask)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            Scalar_<src_t> nppS = s;

            nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };
    template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
    {
        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;

        static void set(GpuMat& src, Scalar s, const GpuMat& mask)
        {
            NppiSize sz;
            sz.width = src.cols;
            sz.height = src.rows;

            Scalar_<src_t> nppS = s;

            nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );

            cudaSafeCall( cudaDeviceSynchronize() );
        }
    };

    template <typename T>
    void kernelSetMask(GpuMat& src, Scalar s, const GpuMat& mask)
    {
        Scalar_<T> sf = s;
        OPENCV_DEVICE_NAMESPACE_ set_to_gpu(src, sf.val, mask, src.channels(), 0);
    }
    class CudaFuncTable : public GpuFuncTable
    {
    public:
        void copy(const Mat& src, GpuMat& dst) const
        {
            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) );
        }
        void copy(const GpuMat& src, Mat& dst) const
        {
            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) );
        }
        void copy(const GpuMat& src, GpuMat& dst) const
        {
            cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) );
        }

        void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
        {
            OPENCV_DEVICE_NAMESPACE_ copy_to_with_mask(src, dst, src.depth(), mask, src.channels());
        }
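Reviewer note: cudaMemcpy2D takes both pitches plus a width in bytes, which is what lets these copy() overloads move a padded device matrix to or from a densely packed host Mat. A standalone sketch of the same call pattern (buffer sizes illustrative):

    #include <cuda_runtime.h>
    #include <vector>

    int main()
    {
        const int rows = 480, cols = 640;               // CV_8UC1-sized payload
        std::vector<unsigned char> host(rows * cols, 127);

        void* dev = 0;
        size_t pitch = 0;
        cudaMallocPitch(&dev, &pitch, cols, rows);      // pitch >= cols

        cudaMemcpy2D(dev, pitch, host.data(), cols,     // dst pitch, src pitch (bytes)
                     cols, rows,                        // width in BYTES, height in rows
                     cudaMemcpyHostToDevice);

        cudaFree(dev);
        return 0;
    }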
        void convert(const GpuMat& src, GpuMat& dst) const
        {
            typedef void (*caller_t)(const GpuMat& src, GpuMat& dst);
            static const caller_t callers[7][7][7] =
            {
                {
                    /* 8U -> 8U */ {0, 0, 0, 0},
                    /* 8U -> 8S */ {convertToKernelCaller, convertToKernelCaller, convertToKernelCaller, convertToKernelCaller},
                    /* 8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::cvt},
                    /* 8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::cvt},
                    /* 8U -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 8U -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
                },
                {
                    /* 8S -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 8S -> 8S */ {0,0,0,0},
                    /* 8S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 8S -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 8S -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 8S -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 8S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
                },
                {
                    /* 16U -> 8U */ {NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::cvt},
                    /* 16U -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 16U -> 16U */ {0,0,0,0},
                    /* 16U -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 16U -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
                },
                {
                    /* 16S -> 8U */ {NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::cvt},
                    /* 16S -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 16S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 16S -> 16S */ {0,0,0,0},
                    /* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 16S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
                },
                {
                    /* 32S -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32S -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32S -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32S -> 32S */ {0,0,0,0},
                    /* 32S -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
                },
                {
                    /* 32F -> 8U */ {NppCvt<CV_32F, CV_8U, nppiConvert_32f8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32F -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32F -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 32F -> 32F */ {0,0,0,0},
                    /* 32F -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
                },
                {
                    /* 64F -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 64F -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 64F -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 64F -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 64F -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 64F -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
                    /* 64F -> 64F */ {0,0,0,0}
                }
            };

            caller_t func = callers[src.depth()][dst.depth()][src.channels() - 1];
            CV_DbgAssert(func != 0);

            func(src, dst);
        }

        void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
        {
            device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta);
        }
        void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
        {
            NppiSize sz;
            sz.width = m.cols;
            sz.height = m.rows;

            if (mask.empty())
            {
                if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
                {
                    cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
                    return;
                }

                if (m.depth() == CV_8U)
                {
                    int cn = m.channels();

                    if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
                    {
                        int val = saturate_cast<uchar>(s[0]);
                        cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
                        return;
                    }
                }

                typedef void (*caller_t)(GpuMat& src, Scalar s);
                static const caller_t callers[7][4] =
                {
                    {NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet<uchar>,kernelSet<uchar>,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
                    {kernelSet<schar>,kernelSet<schar>,kernelSet<schar>,kernelSet<schar>},
                    {NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,NppSet<CV_16U, 2, nppiSet_16u_C2R>::set,kernelSet<ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
                    {NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,NppSet<CV_16S, 2, nppiSet_16s_C2R>::set,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
                    {NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet<int>,kernelSet<int>,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
                    {NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet<float>,kernelSet<float>,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
                    {kernelSet<double>,kernelSet<double>,kernelSet<double>,kernelSet<double>}
                };

                callers[m.depth()][m.channels() - 1](m, s);
            }
            else
            {
                typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask);

                static const caller_t callers[7][4] =
                {
                    {NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask<uchar>,kernelSetMask<uchar>,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
                    {kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>},
                    {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask<ushort>,kernelSetMask<ushort>,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
                    {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask<short>,kernelSetMask<short>,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
                    {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask<int>,kernelSetMask<int>,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
                    {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask<float>,kernelSetMask<float>,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
                    {kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>}
                };

                callers[m.depth()][m.channels() - 1](m, s, mask);
            }
        }

        void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
        {
            cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
        }

        void free(void* devPtr) const
        {
            cudaFree(devPtr);
        }
    };
    class Initializer
    {
    public:
        Initializer()
        {
            static CudaFuncTable funcTable;
            setGpuFuncTable(&funcTable);
        }
    };

    Initializer init;
}

#endif
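Reviewer note: Initializer is the classic static-registration idiom; constructing a file-scope object before main() installs the CUDA function table into the core module, so cv::Mat/GpuMat never link against CUDA directly. The idiom in miniature (all names hypothetical):

    #include <cstdio>

    typedef void (*impl_t)();
    static impl_t g_impl = 0;

    static void cudaImpl() { std::puts("CUDA path"); }

    namespace
    {
        struct Registrar
        {
            Registrar() { g_impl = cudaImpl; }   // runs during static initialization
        };

        Registrar registrar;
    }

    int main()
    {
        if (g_impl) g_impl();   // dispatches to whatever was registered, if anything
        return 0;
    }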
@ -44,6 +44,7 @@

using namespace cv;
using namespace cv::gpu;
using namespace std;

#if !defined (HAVE_CUDA)

@ -51,7 +52,9 @@ void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int, Stream&)

#else

namespace cv { namespace gpu { namespace imgproc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace match_template
{
void matchTemplateNaive_CCORR_8U(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);
void matchTemplateNaive_CCORR_32F(const DevMem2Db image, const DevMem2Db templ, DevMem2Df result, int cn, cudaStream_t stream);

@ -132,8 +135,11 @@ namespace cv { namespace gpu { namespace imgproc
unsigned int templ_sqsum, DevMem2Df result, int cn, cudaStream_t stream);

void extractFirstChannel_32F(const DevMem2Db image, DevMem2Df result, int cn, cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

using namespace OPENCV_DEVICE_NAMESPACE_ match_template;
namespace
{

@ -177,14 +183,14 @@ namespace
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_32F))
{
imgproc::matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
matchTemplateNaive_CCORR_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return;
}

GpuMat result_;
ConvolveBuf buf;
convolve(image.reshape(1), templ.reshape(1), result_, true, buf, stream);
imgproc::extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));
extractFirstChannel_32F(result_, result, image.channels(), StreamAccessor::getStream(stream));
}

@ -193,7 +199,7 @@ namespace
if (templ.size().area() < getTemplateThreshold(CV_TM_CCORR, CV_8U))
{
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
imgproc::matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
matchTemplateNaive_CCORR_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return;
}

@ -220,15 +226,14 @@ namespace
sqrIntegral(image.reshape(1), img_sqsum, stream);

unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];
imgproc::normalize_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum,
result, image.channels(), StreamAccessor::getStream(stream));
normalize_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
}

void matchTemplate_SQDIFF_32F(const GpuMat& image, const GpuMat& templ, GpuMat& result, Stream& stream)
{
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
imgproc::matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
matchTemplateNaive_SQDIFF_32F(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
}

@ -237,7 +242,7 @@ namespace
if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, CV_8U))
{
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
imgproc::matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
matchTemplateNaive_SQDIFF_8U(image, templ, result, image.channels(), StreamAccessor::getStream(stream));
return;
}
@ -247,8 +252,7 @@ namespace
unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];

matchTemplate_CCORR_8U(image, templ, result, stream);
imgproc::matchTemplatePrepared_SQDIFF_8U(
templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
matchTemplatePrepared_SQDIFF_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
}

@ -260,8 +264,7 @@ namespace
unsigned int templ_sqsum = (unsigned int)sqrSum(templ.reshape(1))[0];

matchTemplate_CCORR_8U(image, templ, result, stream);
imgproc::matchTemplatePrepared_SQDIFF_NORMED_8U(
templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
matchTemplatePrepared_SQDIFF_NORMED_8U(templ.cols, templ.rows, img_sqsum, templ_sqsum, result, image.channels(), StreamAccessor::getStream(stream));
}

@ -275,13 +278,12 @@ namespace
integral(image, image_sum, stream);

unsigned int templ_sum = (unsigned int)sum(templ)[0];
imgproc::matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows,
image_sum, templ_sum, result, StreamAccessor::getStream(stream));
matchTemplatePrepared_CCOFF_8U(templ.cols, templ.rows, image_sum, templ_sum, result, StreamAccessor::getStream(stream));
}
else
{
std::vector<GpuMat> images;
std::vector<GpuMat> image_sums(image.channels());
vector<GpuMat> images;
vector<GpuMat> image_sums(image.channels());

split(image, images);
for (int i = 0; i < image.channels(); ++i)

@ -292,19 +294,19 @@ namespace
switch (image.channels())
{
case 2:
imgproc::matchTemplatePrepared_CCOFF_8UC2(
matchTemplatePrepared_CCOFF_8UC2(
templ.cols, templ.rows, image_sums[0], image_sums[1],
(unsigned int)templ_sum[0], (unsigned int)templ_sum[1],
result, StreamAccessor::getStream(stream));
break;
case 3:
imgproc::matchTemplatePrepared_CCOFF_8UC3(
matchTemplatePrepared_CCOFF_8UC3(
templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2],
(unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
result, StreamAccessor::getStream(stream));
break;
case 4:
imgproc::matchTemplatePrepared_CCOFF_8UC4(
matchTemplatePrepared_CCOFF_8UC4(
templ.cols, templ.rows, image_sums[0], image_sums[1], image_sums[2], image_sums[3],
(unsigned int)templ_sum[0], (unsigned int)templ_sum[1], (unsigned int)templ_sum[2],
(unsigned int)templ_sum[3], result, StreamAccessor::getStream(stream));
@ -341,15 +343,15 @@ namespace
|
||||
unsigned int templ_sum = (unsigned int)sum(templ)[0];
|
||||
unsigned int templ_sqsum = (unsigned int)sqrSum(templ)[0];
|
||||
|
||||
imgproc::matchTemplatePrepared_CCOFF_NORMED_8U(
|
||||
matchTemplatePrepared_CCOFF_NORMED_8U(
|
||||
templ.cols, templ.rows, image_sum, image_sqsum,
|
||||
templ_sum, templ_sqsum, result, StreamAccessor::getStream(stream));
|
||||
}
|
||||
else
|
||||
{
|
||||
std::vector<GpuMat> images;
|
||||
std::vector<GpuMat> image_sums(image.channels());
|
||||
std::vector<GpuMat> image_sqsums(image.channels());
|
||||
vector<GpuMat> images;
|
||||
vector<GpuMat> image_sums(image.channels());
|
||||
vector<GpuMat> image_sqsums(image.channels());
|
||||
|
||||
split(image, images);
|
||||
for (int i = 0; i < image.channels(); ++i)
|
||||
@ -364,7 +366,7 @@ namespace
|
||||
switch (image.channels())
|
||||
{
|
||||
case 2:
|
||||
imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC2(
|
||||
matchTemplatePrepared_CCOFF_NORMED_8UC2(
|
||||
templ.cols, templ.rows,
|
||||
image_sums[0], image_sqsums[0],
|
||||
image_sums[1], image_sqsums[1],
|
||||
@ -373,7 +375,7 @@ namespace
|
||||
result, StreamAccessor::getStream(stream));
|
||||
break;
|
||||
case 3:
|
||||
imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC3(
|
||||
matchTemplatePrepared_CCOFF_NORMED_8UC3(
|
||||
templ.cols, templ.rows,
|
||||
image_sums[0], image_sqsums[0],
|
||||
image_sums[1], image_sqsums[1],
|
||||
@ -384,7 +386,7 @@ namespace
|
||||
result, StreamAccessor::getStream(stream));
|
||||
break;
|
||||
case 4:
|
||||
imgproc::matchTemplatePrepared_CCOFF_NORMED_8UC4(
|
||||
matchTemplatePrepared_CCOFF_NORMED_8UC4(
|
||||
templ.cols, templ.rows,
|
||||
image_sums[0], image_sqsums[0],
|
||||
image_sums[1], image_sqsums[1],
|
||||
|
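The hunks above only drop the imgproc:: qualification from these wrappers; dispatch still happens through the public entry point. A minimal usage sketch (not part of this commit; sizes and the CV_TM_CCORR method are illustrative):

#include <opencv2/gpu/gpu.hpp>

void run_match(const cv::Mat& h_image, const cv::Mat& h_templ)
{
    cv::gpu::GpuMat image(h_image), templ(h_templ), result;
    // For CV_8U input this reaches matchTemplate_CCORR_8U above and, for
    // small templates, the naive kernel guarded by getTemplateThreshold().
    cv::gpu::matchTemplate(image, templ, result, CV_TM_CCORR);
    cv::Mat h_result(result); // explicit download, as introduced by this commit
}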
@ -45,6 +45,139 @@
using namespace cv;
using namespace cv::gpu;

cv::gpu::CudaMem::CudaMem()
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
}

cv::gpu::CudaMem::CudaMem(int _rows, int _cols, int _type, int _alloc_type)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
if( _rows > 0 && _cols > 0 )
create( _rows, _cols, _type, _alloc_type);
}

cv::gpu::CudaMem::CudaMem(Size _size, int _type, int _alloc_type)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
if( _size.height > 0 && _size.width > 0 )
create( _size.height, _size.width, _type, _alloc_type);
}

cv::gpu::CudaMem::CudaMem(const CudaMem& m)
: flags(m.flags), rows(m.rows), cols(m.cols), step(m.step), data(m.data), refcount(m.refcount), datastart(m.datastart), dataend(m.dataend), alloc_type(m.alloc_type)
{
if( refcount )
CV_XADD(refcount, 1);
}

cv::gpu::CudaMem::CudaMem(const Mat& m, int _alloc_type)
: flags(0), rows(0), cols(0), step(0), data(0), refcount(0), datastart(0), dataend(0), alloc_type(0)
{
if( m.rows > 0 && m.cols > 0 )
create( m.size(), m.type(), _alloc_type);

Mat tmp = createMatHeader();
m.copyTo(tmp);
}

cv::gpu::CudaMem::~CudaMem()
{
release();
}

CudaMem& cv::gpu::CudaMem::operator = (const CudaMem& m)
{
if( this != &m )
{
if( m.refcount )
CV_XADD(m.refcount, 1);
release();
flags = m.flags;
rows = m.rows; cols = m.cols;
step = m.step; data = m.data;
datastart = m.datastart;
dataend = m.dataend;
refcount = m.refcount;
alloc_type = m.alloc_type;
}
return *this;
}

CudaMem cv::gpu::CudaMem::clone() const
{
CudaMem m(size(), type(), alloc_type);
Mat to = m;
Mat from = *this;
from.copyTo(to);
return m;
}

void cv::gpu::CudaMem::create(Size _size, int _type, int _alloc_type)
{
create(_size.height, _size.width, _type, _alloc_type);
}

Mat cv::gpu::CudaMem::createMatHeader() const
{
return Mat(size(), type(), data, step);
}

cv::gpu::CudaMem::operator Mat() const
{
return createMatHeader();
}

cv::gpu::CudaMem::operator GpuMat() const
{
return createGpuMatHeader();
}

bool cv::gpu::CudaMem::isContinuous() const
{
return (flags & Mat::CONTINUOUS_FLAG) != 0;
}

size_t cv::gpu::CudaMem::elemSize() const
{
return CV_ELEM_SIZE(flags);
}

size_t cv::gpu::CudaMem::elemSize1() const
{
return CV_ELEM_SIZE1(flags);
}

int cv::gpu::CudaMem::type() const
{
return CV_MAT_TYPE(flags);
}

int cv::gpu::CudaMem::depth() const
{
return CV_MAT_DEPTH(flags);
}

int cv::gpu::CudaMem::channels() const
{
return CV_MAT_CN(flags);
}

size_t cv::gpu::CudaMem::step1() const
{
return step/elemSize1();
}

Size cv::gpu::CudaMem::size() const
{
return Size(cols, rows);
}

bool cv::gpu::CudaMem::empty() const
{
return data == 0;
}
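CudaMem above is a page-locked host buffer that can be viewed as a cv::Mat (and, for zero-copy allocations, as a GpuMat) without copying. A minimal sketch (not part of this commit; assumes the ALLOC_PAGE_LOCKED allocation type of this class):

cv::gpu::CudaMem pinned(480, 640, CV_8UC1, cv::gpu::CudaMem::ALLOC_PAGE_LOCKED);
cv::Mat header = pinned.createMatHeader(); // header over the same memory, no copy
header.setTo(cv::Scalar(0));               // fill through the Mat view
cv::gpu::GpuMat d_frame;
d_frame.upload(header);                    // pinned memory speeds up host-to-device copies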
#if !defined (HAVE_CUDA)

void cv::gpu::registerPageLocked(Mat&) { throw_nogpu(); }

@ -190,32 +190,35 @@ double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
////////////////////////////////////////////////////////////////////////
// Sum

namespace cv { namespace gpu { namespace mathfunc
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace matrix_reductions
{
namespace sums
namespace sum
{
template <typename T>
void sumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);

template <typename T>
void sumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);

template <typename T>
void absSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);

template <typename T>
void absSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);

template <typename T>
void sqrSumCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);

template <typename T>
void sqrSumMultipassCaller(const DevMem2Db src, PtrStepb buf, double* sum, int cn);

void getBufSizeRequired(int cols, int rows, int cn, int& bufcols, int& bufrows);
}
}}}
}

END_OPENCV_DEVICE_NAMESPACE

Scalar cv::gpu::sum(const GpuMat& src)
{
@ -226,23 +229,25 @@ Scalar cv::gpu::sum(const GpuMat& src)

Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc;
using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::sum;

typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);

static Caller multipass_callers[7] = {
sumMultipassCaller<unsigned char>, sumMultipassCaller<char>,
sumMultipassCaller<unsigned short>, sumMultipassCaller<short>,
sumMultipassCaller<int>, sumMultipassCaller<float>, 0 };
static Caller multipass_callers[7] =
{
sumMultipassCaller<unsigned char>, sumMultipassCaller<char>,
sumMultipassCaller<unsigned short>, sumMultipassCaller<short>,
sumMultipassCaller<int>, sumMultipassCaller<float>, 0
};

static Caller singlepass_callers[7] = {
sumCaller<unsigned char>, sumCaller<char>,
sumCaller<unsigned short>, sumCaller<short>,
sumCaller<int>, sumCaller<float>, 0 };
sumCaller<unsigned char>, sumCaller<char>,
sumCaller<unsigned short>, sumCaller<short>,
sumCaller<int>, sumCaller<float>, 0
};

Size buf_size;
sums::getBufSizeRequired(src.cols, src.rows, src.channels(),
buf_size.width, buf_size.height);
getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);

Caller* callers = multipass_callers;
@ -267,23 +272,26 @@ Scalar cv::gpu::absSum(const GpuMat& src)

Scalar cv::gpu::absSum(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc;
using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::sum;

typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);

static Caller multipass_callers[7] = {
absSumMultipassCaller<unsigned char>, absSumMultipassCaller<char>,
absSumMultipassCaller<unsigned short>, absSumMultipassCaller<short>,
absSumMultipassCaller<int>, absSumMultipassCaller<float>, 0 };
static Caller multipass_callers[7] =
{
absSumMultipassCaller<unsigned char>, absSumMultipassCaller<char>,
absSumMultipassCaller<unsigned short>, absSumMultipassCaller<short>,
absSumMultipassCaller<int>, absSumMultipassCaller<float>, 0
};

static Caller singlepass_callers[7] = {
absSumCaller<unsigned char>, absSumCaller<char>,
absSumCaller<unsigned short>, absSumCaller<short>,
absSumCaller<int>, absSumCaller<float>, 0 };
static Caller singlepass_callers[7] =
{
absSumCaller<unsigned char>, absSumCaller<char>,
absSumCaller<unsigned short>, absSumCaller<short>,
absSumCaller<int>, absSumCaller<float>, 0
};

Size buf_size;
sums::getBufSizeRequired(src.cols, src.rows, src.channels(),
buf_size.width, buf_size.height);
getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);

Caller* callers = multipass_callers;
@ -308,27 +316,30 @@ Scalar cv::gpu::sqrSum(const GpuMat& src)

Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc;
using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::sum;

typedef void (*Caller)(const DevMem2Db, PtrStepb, double*, int);

static Caller multipass_callers[7] = {
sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>,
sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>,
sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>, 0 };
static Caller multipass_callers[7] =
{
sqrSumMultipassCaller<unsigned char>, sqrSumMultipassCaller<char>,
sqrSumMultipassCaller<unsigned short>, sqrSumMultipassCaller<short>,
sqrSumMultipassCaller<int>, sqrSumMultipassCaller<float>, 0
};

static Caller singlepass_callers[7] = {
sqrSumCaller<unsigned char>, sqrSumCaller<char>,
sqrSumCaller<unsigned short>, sqrSumCaller<short>,
sqrSumCaller<int>, sqrSumCaller<float>, 0 };
static Caller singlepass_callers[7] =
{
sqrSumCaller<unsigned char>, sqrSumCaller<char>,
sqrSumCaller<unsigned short>, sqrSumCaller<short>,
sqrSumCaller<int>, sqrSumCaller<float>, 0
};

Caller* callers = multipass_callers;
if (TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS))
callers = singlepass_callers;

Size buf_size;
sums::getBufSizeRequired(src.cols, src.rows, src.channels(),
buf_size.width, buf_size.height);
getBufSizeRequired(src.cols, src.rows, src.channels(), buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);

Caller caller = callers[src.depth()];
@ -339,29 +350,32 @@ Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
return Scalar(result[0], result[1], result[2], result[3]);
}

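The buffered overloads exist so the intermediate reduction buffer can be reused across calls; each call resizes it through getBufSizeRequired/ensureSizeIsEnough as shown above. A minimal usage sketch (not part of this commit):

cv::gpu::GpuMat src(1080, 1920, CV_32FC1), buf;
cv::Scalar s  = cv::gpu::sum(src, buf);    // plain sum
cv::Scalar as = cv::gpu::absSum(src, buf); // sum of absolute values, same scratch buffer
cv::Scalar ss = cv::gpu::sqrSum(src, buf); // sum of squares, same scratch buffer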
////////////////////////////////////////////////////////////////////////
// Find min or max

namespace cv { namespace gpu { namespace mathfunc { namespace minmax {
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace matrix_reductions
{
namespace minmax
{
void getBufSizeRequired(int cols, int rows, int elem_size, int& bufcols, int& bufrows);

template <typename T>
void minMaxCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);

template <typename T>
void minMaxMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);

template <typename T>
void minMaxMultipassCaller(const DevMem2Db src, double* minval, double* maxval, PtrStepb buf);

template <typename T>
void minMaxMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval, PtrStepb buf);
}
}

}}}}
END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
@ -373,39 +387,43 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp

void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
{
using namespace mathfunc::minmax;
using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::minmax;

typedef void (*Caller)(const DevMem2Db, double*, double*, PtrStepb);
typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, PtrStepb);

static Caller multipass_callers[7] = {
minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>,
minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>,
minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0 };
static Caller multipass_callers[7] =
{
minMaxMultipassCaller<unsigned char>, minMaxMultipassCaller<char>,
minMaxMultipassCaller<unsigned short>, minMaxMultipassCaller<short>,
minMaxMultipassCaller<int>, minMaxMultipassCaller<float>, 0
};

static Caller singlepass_callers[7] = {
minMaxCaller<unsigned char>, minMaxCaller<char>,
minMaxCaller<unsigned short>, minMaxCaller<short>,
minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double> };
static Caller singlepass_callers[7] =
{
minMaxCaller<unsigned char>, minMaxCaller<char>,
minMaxCaller<unsigned short>, minMaxCaller<short>,
minMaxCaller<int>, minMaxCaller<float>, minMaxCaller<double>
};

static MaskedCaller masked_multipass_callers[7] = {
minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>,
minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,
minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0 };
static MaskedCaller masked_multipass_callers[7] =
{
minMaxMaskMultipassCaller<unsigned char>, minMaxMaskMultipassCaller<char>,
minMaxMaskMultipassCaller<unsigned short>, minMaxMaskMultipassCaller<short>,
minMaxMaskMultipassCaller<int>, minMaxMaskMultipassCaller<float>, 0
};

static MaskedCaller masked_singlepass_callers[7] = {
minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>,
minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>,
minMaxMaskCaller<int>, minMaxMaskCaller<float>,
minMaxMaskCaller<double> };
static MaskedCaller masked_singlepass_callers[7] =
{
minMaxMaskCaller<unsigned char>, minMaxMaskCaller<char>,
minMaxMaskCaller<unsigned short>, minMaxMaskCaller<short>,
minMaxMaskCaller<int>, minMaxMaskCaller<float>, minMaxMaskCaller<double>
};

CV_Assert(src.channels() == 1);

CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));

CV_Assert(src.type() != CV_64F || (TargetArchs::builtWith(NATIVE_DOUBLE) &&
DeviceInfo().supports(NATIVE_DOUBLE)));

double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_;

@ -439,28 +457,34 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const Gp
////////////////////////////////////////////////////////////////////////
// Locate min and max

namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc {
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace matrix_reductions
{
namespace minmaxloc
{
void getBufSizeRequired(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows);

template <typename T>
void minMaxLocCaller(const DevMem2Db src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);

template <typename T>
void minMaxLocMaskCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);

template <typename T>
void minMaxLocMultipassCaller(const DevMem2Db src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);

template <typename T>
void minMaxLocMaskMultipassCaller(const DevMem2Db src, const PtrStepb mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStepb valBuf, PtrStepb locBuf);
}
}

}}}}
END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
{
@ -468,43 +492,46 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valBuf, locBuf);
}

void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
const GpuMat& mask, GpuMat& valBuf, GpuMat& locBuf)
{
using namespace mathfunc::minmaxloc;
using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::minmaxloc;

typedef void (*Caller)(const DevMem2Db, double*, double*, int[2], int[2], PtrStepb, PtrStepb);
typedef void (*MaskedCaller)(const DevMem2Db, const PtrStepb, double*, double*, int[2], int[2], PtrStepb, PtrStepb);

static Caller multipass_callers[7] = {
minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>,
minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>,
minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0 };
static Caller multipass_callers[7] =
{
minMaxLocMultipassCaller<unsigned char>, minMaxLocMultipassCaller<char>,
minMaxLocMultipassCaller<unsigned short>, minMaxLocMultipassCaller<short>,
minMaxLocMultipassCaller<int>, minMaxLocMultipassCaller<float>, 0
};

static Caller singlepass_callers[7] = {
minMaxLocCaller<unsigned char>, minMaxLocCaller<char>,
minMaxLocCaller<unsigned short>, minMaxLocCaller<short>,
minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double> };
static Caller singlepass_callers[7] =
{
minMaxLocCaller<unsigned char>, minMaxLocCaller<char>,
minMaxLocCaller<unsigned short>, minMaxLocCaller<short>,
minMaxLocCaller<int>, minMaxLocCaller<float>, minMaxLocCaller<double>
};

static MaskedCaller masked_multipass_callers[7] = {
minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>,
minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>,
minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0 };
static MaskedCaller masked_multipass_callers[7] =
{
minMaxLocMaskMultipassCaller<unsigned char>, minMaxLocMaskMultipassCaller<char>,
minMaxLocMaskMultipassCaller<unsigned short>, minMaxLocMaskMultipassCaller<short>,
minMaxLocMaskMultipassCaller<int>, minMaxLocMaskMultipassCaller<float>, 0
};

static MaskedCaller masked_singlepass_callers[7] = {
minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>,
minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>,
minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>,
minMaxLocMaskCaller<double> };
static MaskedCaller masked_singlepass_callers[7] =
{
minMaxLocMaskCaller<unsigned char>, minMaxLocMaskCaller<char>,
minMaxLocMaskCaller<unsigned short>, minMaxLocMaskCaller<short>,
minMaxLocMaskCaller<int>, minMaxLocMaskCaller<float>, minMaxLocMaskCaller<double>
};

CV_Assert(src.channels() == 1);

CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));

CV_Assert(src.type() != CV_64F || (TargetArchs::builtWith(NATIVE_DOUBLE) &&
DeviceInfo().supports(NATIVE_DOUBLE)));

double minVal_; if (!minVal) minVal = &minVal_;
double maxVal_; if (!maxVal) maxVal = &maxVal_;
int minLoc_[2];
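A minimal usage sketch of the buffered overload declared above (not part of this commit; the empty GpuMat stands in for "no mask"):

cv::gpu::GpuMat src(480, 640, CV_8UC1), valBuf, locBuf;
double minVal, maxVal;
cv::Point minLoc, maxLoc;
cv::gpu::minMaxLoc(src, &minVal, &maxVal, &minLoc, &maxLoc, cv::gpu::GpuMat(), valBuf, locBuf);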
@ -544,18 +571,23 @@ void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point
//////////////////////////////////////////////////////////////////////////////
// Count non-zero elements

namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero {
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace matrix_reductions
{
namespace countnonzero
{
void getBufSizeRequired(int cols, int rows, int& bufcols, int& bufrows);

template <typename T>
int countNonZeroCaller(const DevMem2Db src, PtrStepb buf);

template <typename T>
int countNonZeroMultipassCaller(const DevMem2Db src, PtrStepb buf);
}
}

}}}}
END_OPENCV_DEVICE_NAMESPACE

int cv::gpu::countNonZero(const GpuMat& src)
{
@ -566,26 +598,25 @@ int cv::gpu::countNonZero(const GpuMat& src)

int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
{
using namespace mathfunc::countnonzero;
using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions::countnonzero;

typedef int (*Caller)(const DevMem2Db src, PtrStepb buf);

static Caller multipass_callers[7] = {
countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>,
countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>,
countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0 };
static Caller multipass_callers[7] =
{
countNonZeroMultipassCaller<unsigned char>, countNonZeroMultipassCaller<char>,
countNonZeroMultipassCaller<unsigned short>, countNonZeroMultipassCaller<short>,
countNonZeroMultipassCaller<int>, countNonZeroMultipassCaller<float>, 0
};

static Caller singlepass_callers[7] = {
countNonZeroCaller<unsigned char>, countNonZeroCaller<char>,
countNonZeroCaller<unsigned short>, countNonZeroCaller<short>,
countNonZeroCaller<int>, countNonZeroCaller<float>,
countNonZeroCaller<double> };
static Caller singlepass_callers[7] =
{
countNonZeroCaller<unsigned char>, countNonZeroCaller<char>,
countNonZeroCaller<unsigned short>, countNonZeroCaller<short>,
countNonZeroCaller<int>, countNonZeroCaller<float>, countNonZeroCaller<double> };

CV_Assert(src.channels() == 1);

CV_Assert(src.type() != CV_64F || (TargetArchs::builtWith(NATIVE_DOUBLE) &&
DeviceInfo().supports(NATIVE_DOUBLE)));

Size buf_size;
getBufSizeRequired(src.cols, src.rows, buf_size.width, buf_size.height);
ensureSizeIsEnough(buf_size, CV_8U, buf);
@ -601,15 +632,20 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)

//////////////////////////////////////////////////////////////////////////////
// reduce
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace cv { namespace gpu { namespace mathfunc {
namespace matrix_reductions
{
template <typename T, typename S, typename D> void reduceRows_gpu(const DevMem2Db& src, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
template <typename T, typename S, typename D> void reduceCols_gpu(const DevMem2Db& src, int cn, const DevMem2Db& dst, int reduceOp, cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

void cv::gpu::reduce(const GpuMat& src, GpuMat& dst, int dim, int reduceOp, int dtype, Stream& stream)
{
using namespace cv::gpu::mathfunc;
using namespace OPENCV_DEVICE_NAMESPACE_ matrix_reductions;

CV_Assert(src.depth() <= CV_32F && src.channels() <= 4 && dtype <= CV_32F);
CV_Assert(dim == 0 || dim == 1);
CV_Assert(reduceOp == CV_REDUCE_SUM || reduceOp == CV_REDUCE_AVG || reduceOp == CV_REDUCE_MAX || reduceOp == CV_REDUCE_MIN);
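A minimal usage sketch for the asserts above (not part of this commit): dim == 0 collapses the rows, so CV_REDUCE_SUM over a 256x1024 CV_32F matrix yields a 1x1024 row vector via reduceRows_gpu.

cv::gpu::GpuMat src(256, 1024, CV_32FC1), dst;
cv::gpu::reduce(src, dst, 0, CV_REDUCE_SUM, CV_32F); // dst is 1 x 1024, CV_32F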
@ -234,10 +234,10 @@ void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr,
const int hsp = sp;

// Perform mean shift procedure and obtain region and spatial maps
GpuMat h_rmap, h_spmap;
meanShiftProc(src, h_rmap, h_spmap, sp, sr, criteria);
Mat rmap = h_rmap;
Mat spmap = h_spmap;
GpuMat d_rmap, d_spmap;
meanShiftProc(src, d_rmap, d_spmap, sp, sr, criteria);
Mat rmap(d_rmap);
Mat spmap(d_spmap);
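The change above switches from assignment to the Mat(const gpu::GpuMat&) constructor this commit introduces: because the constructor is explicit, device-to-host downloads are now spelled out at the call site. A short illustration (not part of this commit):

cv::gpu::GpuMat d_img(480, 640, CV_8UC3);
cv::Mat h_img(d_img);     // explicit device-to-host copy
// cv::Mat h_img = d_img; // no longer compiles: the download constructor is explicit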

Graph<SegmLinkVal> g(nrows * ncols, 4 * (nrows - 1) * (ncols - 1)
+ (nrows - 1) + (ncols - 1));
@ -352,7 +352,7 @@ void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr,
}

// Compute sum of the pixel's colors which are in the same segment
Mat h_src = src;
Mat h_src(src);
vector<Vec4i> sumcols(nrows * ncols, Vec4i(0, 0, 0, 0));
for (int y = 0; y < nrows; ++y)
{

File diff suppressed because it is too large
@ -43,179 +43,181 @@
#ifndef __OPENCV_GPU_COLOR_HPP__
#define __OPENCV_GPU_COLOR_HPP__

#include "internal_shared.hpp"
#include "detail/color_detail.hpp"

namespace cv { namespace gpu { namespace device
{
BEGIN_OPENCV_DEVICE_NAMESPACE

// All OPENCV_GPU_IMPLEMENT_*_TRAITS(ColorSpace1_to_ColorSpace2, ...) macros implement
// template <typename T> class ColorSpace1_to_ColorSpace2_traits
// {
//     typedef ... functor_type;
//     static __host__ __device__ functor_type create_functor();
// };

OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_bgra, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgr_to_rgba, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_bgr, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS(bgra_to_rgba, 4, 4, 2)

#undef OPENCV_GPU_IMPLEMENT_RGB2RGB_TRAITS

OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr555, 3, 0, 5)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgr_to_bgr565, 3, 0, 6)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr555, 3, 2, 5)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgb_to_bgr565, 3, 2, 6)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr555, 4, 0, 5)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(bgra_to_bgr565, 4, 0, 6)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr555, 4, 2, 5)
OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS(rgba_to_bgr565, 4, 2, 6)

#undef OPENCV_GPU_IMPLEMENT_RGB2RGB5x5_TRAITS

OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgb, 3, 2, 5)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgb, 3, 2, 6)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgr, 3, 0, 5)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgr, 3, 0, 6)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_rgba, 4, 2, 5)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_rgba, 4, 2, 6)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr555_to_bgra, 4, 0, 5)
OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS(bgr565_to_bgra, 4, 0, 6)

#undef OPENCV_GPU_IMPLEMENT_RGB5x52RGB_TRAITS

OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgr, 3)
OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS(gray_to_bgra, 4)

#undef OPENCV_GPU_IMPLEMENT_GRAY2RGB_TRAITS

OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr555, 5)
OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS(gray_to_bgr565, 6)

#undef OPENCV_GPU_IMPLEMENT_GRAY2RGB5x5_TRAITS

OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr555_to_gray, 5)
OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS(bgr565_to_gray, 6)

#undef OPENCV_GPU_IMPLEMENT_RGB5x52GRAY_TRAITS

OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgb_to_gray, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgr_to_gray, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(rgba_to_gray, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS(bgra_to_gray, 4, 0)

#undef OPENCV_GPU_IMPLEMENT_RGB2GRAY_TRAITS

OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgb_to_yuv4, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(rgba_to_yuv4, 4, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgr_to_yuv4, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS(bgra_to_yuv4, 4, 4, 2)

#undef OPENCV_GPU_IMPLEMENT_RGB2YUV_TRAITS

OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgb, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_rgba, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgb, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_rgba, 4, 4, 0)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgr, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv_to_bgra, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgr, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS(yuv4_to_bgra, 4, 4, 2)

#undef OPENCV_GPU_IMPLEMENT_YUV2RGB_TRAITS

OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgb_to_YCrCb4, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(rgba_to_YCrCb4, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgr_to_YCrCb4, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS(bgra_to_YCrCb4, 4, 4, 0)

#undef OPENCV_GPU_IMPLEMENT_RGB2YCrCb_TRAITS

OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_rgba, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_rgba, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgr, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb_to_bgra, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgr, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS(YCrCb4_to_bgra, 4, 4, 0)

#undef OPENCV_GPU_IMPLEMENT_YCrCb2RGB_TRAITS

OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgb_to_xyz4, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(rgba_to_xyz4, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgr_to_xyz4, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS(bgra_to_xyz4, 4, 4, 0)

#undef OPENCV_GPU_IMPLEMENT_RGB2XYZ_TRAITS

OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_rgba, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_rgba, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgr, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgr, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz_to_bgra, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS(xyz4_to_bgra, 4, 4, 0)

#undef OPENCV_GPU_IMPLEMENT_XYZ2RGB_TRAITS

OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgb_to_hsv4, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(rgba_to_hsv4, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgr_to_hsv4, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS(bgra_to_hsv4, 4, 4, 0)

#undef OPENCV_GPU_IMPLEMENT_RGB2HSV_TRAITS

OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_rgba, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_rgba, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgr, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv_to_bgra, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgr, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS(hsv4_to_bgra, 4, 4, 0)

#undef OPENCV_GPU_IMPLEMENT_HSV2RGB_TRAITS

OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgb_to_hls4, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(rgba_to_hls4, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgr_to_hls4, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS(bgra_to_hls4, 4, 4, 0)

#undef OPENCV_GPU_IMPLEMENT_RGB2HLS_TRAITS

OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgb, 3, 3, 2)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_rgba, 3, 4, 2)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgb, 4, 3, 2)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_rgba, 4, 4, 2)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgr, 3, 3, 0)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls_to_bgra, 3, 4, 0)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgr, 4, 3, 0)
OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS(hls4_to_bgra, 4, 4, 0)

#undef OPENCV_GPU_IMPLEMENT_HLS2RGB_TRAITS
}}}

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_COLOR_HPP__
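Each OPENCV_GPU_IMPLEMENT_*_TRAITS invocation above generates a traits class of the shape documented in the header comment. A hand-written sketch of what one instantiation provides (not literal macro output; the functor body is elided):

// template <typename T> class bgr_to_rgb_traits
// {
//     typedef ... functor_type; // device functor mapping 3-channel BGR to RGB
//     static __host__ __device__ functor_type create_functor();
// };
// Callers fetch the functor and hand it to the transform framework:
// bgr_to_rgb_traits<uchar>::functor_type op = bgr_to_rgb_traits<uchar>::create_functor();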
@ -45,6 +45,8 @@
#include "internal_shared.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

#if defined(_WIN64) || defined(__LP64__)
// 64-bit register modifier for inlined asm
#define OPENCV_GPU_ASM_PTR "l"
@ -53,8 +55,6 @@
#define OPENCV_GPU_ASM_PTR "r"
#endif

namespace cv { namespace gpu { namespace device
{
#if __CUDA_ARCH__ >= 200

// for Fermi memory space is detected automatically
@ -99,6 +99,7 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_DEFINE_FORCE_GLOB_B

#endif // __CUDA_ARCH__ >= 200
}}}

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_DATAMOV_UTILS_HPP__

File diff suppressed because it is too large
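OPENCV_GPU_ASM_PTR supplies the inline-asm constraint letter for pointer operands: "l" (64-bit register) on _WIN64/__LP64__ targets and "r" otherwise. A minimal sketch of how such a constraint is consumed (an illustrative load, not code from this file):

__device__ __forceinline__ int load_global_s32(const int* ptr)
{
    int val;
    // the macro expands to the string literal "l" or "r", which becomes
    // the register constraint of the pointer operand
    asm("ld.global.s32 %0, [%1];" : "=r"(val) : OPENCV_GPU_ASM_PTR(ptr));
    return val;
}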
@ -47,364 +47,365 @@
#include "../vec_traits.hpp"
#include "../functional.hpp"

namespace cv { namespace gpu { namespace device
{
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace detail
{
//! Mask accessor

struct MaskReader
{
explicit MaskReader(const PtrStepb& mask_): mask(mask_) {}

__device__ __forceinline__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; }

const PtrStepb mask;
};

struct NoMask
{
__device__ __forceinline__ bool operator()(int y, int x) const { return true; }
};

//! Read Write Traits

template <typename T, typename D, int shift> struct UnaryReadWriteTraits
{
typedef typename TypeVec<T, shift>::vec_type read_type;
typedef typename TypeVec<D, shift>::vec_type write_type;
};

template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
{
typedef typename TypeVec<T1, shift>::vec_type read_type1;
typedef typename TypeVec<T2, shift>::vec_type read_type2;
typedef typename TypeVec<D, shift>::vec_type write_type;
};

//! Transform kernels

template <int shift> struct OpUnroller;
template <> struct OpUnroller<1>
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src.x);
}

template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src1.x, src2.x);
}
};
template <> struct OpUnroller<2>
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src.x);
if (mask(y, x_shifted + 1))
dst.y = op(src.y);
}

template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src1.x, src2.x);
if (mask(y, x_shifted + 1))
dst.y = op(src1.y, src2.y);
}
};
template <> struct OpUnroller<3>
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src.x);
if (mask(y, x_shifted + 1))
dst.y = op(src.y);
if (mask(y, x_shifted + 2))
dst.z = op(src.z);
}

template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src1.x, src2.x);
if (mask(y, x_shifted + 1))
dst.y = op(src1.y, src2.y);
if (mask(y, x_shifted + 2))
dst.z = op(src1.z, src2.z);
}
};
template <> struct OpUnroller<4>
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src.x);
if (mask(y, x_shifted + 1))
dst.y = op(src.y);
if (mask(y, x_shifted + 2))
dst.z = op(src.z);
if (mask(y, x_shifted + 3))
dst.w = op(src.w);
}

template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.x = op(src1.x, src2.x);
if (mask(y, x_shifted + 1))
dst.y = op(src1.y, src2.y);
if (mask(y, x_shifted + 2))
dst.z = op(src1.z, src2.z);
if (mask(y, x_shifted + 3))
dst.w = op(src1.w, src2.w);
}
};
template <> struct OpUnroller<8>
{
template <typename T, typename D, typename UnOp, typename Mask>
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.a0 = op(src.a0);
if (mask(y, x_shifted + 1))
dst.a1 = op(src.a1);
if (mask(y, x_shifted + 2))
dst.a2 = op(src.a2);
if (mask(y, x_shifted + 3))
dst.a3 = op(src.a3);
if (mask(y, x_shifted + 4))
dst.a4 = op(src.a4);
if (mask(y, x_shifted + 5))
dst.a5 = op(src.a5);
if (mask(y, x_shifted + 6))
dst.a6 = op(src.a6);
if (mask(y, x_shifted + 7))
dst.a7 = op(src.a7);
}

template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
{
if (mask(y, x_shifted))
dst.a0 = op(src1.a0, src2.a0);
if (mask(y, x_shifted + 1))
dst.a1 = op(src1.a1, src2.a1);
if (mask(y, x_shifted + 2))
dst.a2 = op(src1.a2, src2.a2);
if (mask(y, x_shifted + 3))
dst.a3 = op(src1.a3, src2.a3);
if (mask(y, x_shifted + 4))
dst.a4 = op(src1.a4, src2.a4);
if (mask(y, x_shifted + 5))
dst.a5 = op(src1.a5, src2.a5);
if (mask(y, x_shifted + 6))
dst.a6 = op(src1.a6, src2.a6);
if (mask(y, x_shifted + 7))
dst.a7 = op(src1.a7, src2.a7);
}
};

template <typename T, typename D, typename UnOp, typename Mask>
__global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
{
typedef TransformFunctorTraits<UnOp> ft;
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;

const int x = threadIdx.x + blockIdx.x * blockDim.x;
const int y = threadIdx.y + blockIdx.y * blockDim.y;
const int x_shifted = x * ft::smart_shift;

if (y < src_.rows)
{
const T* src = src_.ptr(y);
D* dst = dst_.ptr(y);

if (x_shifted + ft::smart_shift - 1 < src_.cols)
{
const read_type src_n_el = ((const read_type*)src)[x];
write_type dst_n_el;

OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);

((write_type*)dst)[x] = dst_n_el;
}
else
{
for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
{
if (mask(y, real_x))
dst[real_x] = op(src[real_x]);
}
}
}
}

template <typename T, typename D, typename UnOp, typename Mask>
static __global__ void transformSimple(const DevMem2D_<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;

if (x < src.cols && y < src.rows && mask(y, x))
{
dst.ptr(y)[x] = op(src.ptr(y)[x]);
}
}

template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
__global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
const Mask mask, const BinOp op)
{
typedef TransformFunctorTraits<BinOp> ft;
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;

const int x = threadIdx.x + blockIdx.x * blockDim.x;
const int y = threadIdx.y + blockIdx.y * blockDim.y;
const int x_shifted = x * ft::smart_shift;

if (y < src1_.rows)
{
const T1* src1 = src1_.ptr(y);
const T2* src2 = src2_.ptr(y);
D* dst = dst_.ptr(y);

if (x_shifted + ft::smart_shift - 1 < src1_.cols)
{
const read_type1 src1_n_el = ((const read_type1*)src1)[x];
const read_type2 src2_n_el = ((const read_type2*)src2)[x];
write_type dst_n_el;

OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);

((write_type*)dst)[x] = dst_n_el;
}
else
{
for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
{
if (mask(y, real_x))
dst[real_x] = op(src1[real_x], src2[real_x]);
}
}
}
}

template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
static __global__ void transformSimple(const DevMem2D_<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
const Mask mask, const BinOp op)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;

if (x < src1.cols && y < src1.rows && mask(y, x))
{
const T1 src1_data = src1.ptr(y)[x];
const T2 src2_data = src2.ptr(y)[x];
dst.ptr(y)[x] = op(src1_data, src2_data);
}
}

template <bool UseSmart> struct TransformDispatcher;
|
||||
template<> struct TransformDispatcher<false>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
|
||||
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);
|
||||
|
||||
transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
|
||||
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);
|
||||
|
||||
transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
template<> struct TransformDispatcher<true>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
|
||||
StaticAssert<ft::smart_shift != 1>::check();
|
||||
|
||||
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);
|
||||
|
||||
transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
|
||||
StaticAssert<ft::smart_shift != 1>::check();
|
||||
|
||||
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);
|
||||
|
||||
transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
|
||||
};
|
||||
template <> struct OpUnroller<2>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream);
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src.y);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream);
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src1.x, src2.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src1.y, src2.y);
|
||||
}
|
||||
};
|
||||
template <> struct OpUnroller<3>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src.y);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.z = op(src.z);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src1.x, src2.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src1.y, src2.y);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.z = op(src1.z, src2.z);
|
||||
}
|
||||
};
|
||||
template <> struct OpUnroller<4>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src.y);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.z = op(src.z);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.w = op(src.w);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.x = op(src1.x, src2.x);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.y = op(src1.y, src2.y);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.z = op(src1.z, src2.z);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.w = op(src1.w, src2.w);
|
||||
}
|
||||
};
|
||||
template <> struct OpUnroller<8>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.a0 = op(src.a0);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.a1 = op(src.a1);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.a2 = op(src.a2);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.a3 = op(src.a3);
|
||||
if (mask(y, x_shifted + 4))
|
||||
dst.a4 = op(src.a4);
|
||||
if (mask(y, x_shifted + 5))
|
||||
dst.a5 = op(src.a5);
|
||||
if (mask(y, x_shifted + 6))
|
||||
dst.a6 = op(src.a6);
|
||||
if (mask(y, x_shifted + 7))
|
||||
dst.a7 = op(src.a7);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
|
||||
{
|
||||
if (mask(y, x_shifted))
|
||||
dst.a0 = op(src1.a0, src2.a0);
|
||||
if (mask(y, x_shifted + 1))
|
||||
dst.a1 = op(src1.a1, src2.a1);
|
||||
if (mask(y, x_shifted + 2))
|
||||
dst.a2 = op(src1.a2, src2.a2);
|
||||
if (mask(y, x_shifted + 3))
|
||||
dst.a3 = op(src1.a3, src2.a3);
|
||||
if (mask(y, x_shifted + 4))
|
||||
dst.a4 = op(src1.a4, src2.a4);
|
||||
if (mask(y, x_shifted + 5))
|
||||
dst.a5 = op(src1.a5, src2.a5);
|
||||
if (mask(y, x_shifted + 6))
|
||||
dst.a6 = op(src1.a6, src2.a6);
|
||||
if (mask(y, x_shifted + 7))
|
||||
dst.a7 = op(src1.a7, src2.a7);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
__global__ static void transformSmart(const DevMem2D_<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
|
||||
typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;
|
||||
|
||||
const int x = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int y = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int x_shifted = x * ft::smart_shift;
|
||||
|
||||
if (y < src_.rows)
|
||||
{
|
||||
const T* src = src_.ptr(y);
|
||||
D* dst = dst_.ptr(y);
|
||||
|
||||
if (x_shifted + ft::smart_shift - 1 < src_.cols)
|
||||
{
|
||||
const read_type src_n_el = ((const read_type*)src)[x];
|
||||
write_type dst_n_el;
|
||||
|
||||
OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);
|
||||
|
||||
((write_type*)dst)[x] = dst_n_el;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
|
||||
{
|
||||
if (mask(y, real_x))
|
||||
dst[real_x] = op(src[real_x]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}}}
|
||||
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static __global__ void transformSimple(const DevMem2D_<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x < src.cols && y < src.rows && mask(y, x))
|
||||
{
|
||||
dst.ptr(y)[x] = op(src.ptr(y)[x]);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
__global__ static void transformSmart(const DevMem2D_<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
|
||||
const Mask mask, const BinOp op)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
|
||||
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
|
||||
typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;
|
||||
|
||||
const int x = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
const int y = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
const int x_shifted = x * ft::smart_shift;
|
||||
|
||||
if (y < src1_.rows)
|
||||
{
|
||||
const T1* src1 = src1_.ptr(y);
|
||||
const T2* src2 = src2_.ptr(y);
|
||||
D* dst = dst_.ptr(y);
|
||||
|
||||
if (x_shifted + ft::smart_shift - 1 < src1_.cols)
|
||||
{
|
||||
const read_type1 src1_n_el = ((const read_type1*)src1)[x];
|
||||
const read_type2 src2_n_el = ((const read_type2*)src2)[x];
|
||||
write_type dst_n_el;
|
||||
|
||||
OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);
|
||||
|
||||
((write_type*)dst)[x] = dst_n_el;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
|
||||
{
|
||||
if (mask(y, real_x))
|
||||
dst[real_x] = op(src1[real_x], src2[real_x]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static __global__ void transformSimple(const DevMem2D_<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
|
||||
const Mask mask, const BinOp op)
|
||||
{
|
||||
const int x = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
const int y = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
|
||||
if (x < src1.cols && y < src1.rows && mask(y, x))
|
||||
{
|
||||
const T1 src1_data = src1.ptr(y)[x];
|
||||
const T2 src2_data = src2.ptr(y)[x];
|
||||
dst.ptr(y)[x] = op(src1_data, src2_data);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool UseSmart> struct TransformDispatcher;
|
||||
template<> struct TransformDispatcher<false>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
|
||||
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);
|
||||
|
||||
transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
|
||||
const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);
|
||||
|
||||
transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
template<> struct TransformDispatcher<true>
|
||||
{
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static void call(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
|
||||
StaticAssert<ft::smart_shift != 1>::check();
|
||||
|
||||
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);
|
||||
|
||||
transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static void call(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
|
||||
StaticAssert<ft::smart_shift != 1>::check();
|
||||
|
||||
const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
|
||||
const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);
|
||||
|
||||
transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename D, typename UnOp, typename Mask>
|
||||
static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<UnOp> ft;
|
||||
TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream);
|
||||
}
|
||||
|
||||
template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
|
||||
static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, const Mask& mask, cudaStream_t stream)
|
||||
{
|
||||
typedef TransformFunctorTraits<BinOp> ft;
|
||||
TransformDispatcher<VecTraits<T1>::cn == 1 && VecTraits<T2>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src1, src2, dst, op, mask, stream);
|
||||
}
|
||||
}
|
||||
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
#endif // __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
|
||||
|
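Editor's sketch: how transform_caller above is meant to be driven. MultiplyScalar is a hypothetical functor, and WithOutMask (the pass-through mask from this module's utility headers) is assumed available; neither line below is part of the commit.

    // Sketch only: a hypothetical unary functor fed to transform_caller.
    struct MultiplyScalar : unary_function<float, float>
    {
        explicit __host__ __device__ MultiplyScalar(float s_) : s(s_) {}
        __device__ __forceinline__ float operator ()(float v) const { return v * s; }
        const float s;
    };

    void multiplyCaller(const DevMem2Df& src, const DevMem2Df& dst, float scale, cudaStream_t stream)
    {
        // Single-channel types with smart_shift > 1 select transformSmart
        // (vectorized read_type/write_type accesses); anything else falls
        // back to the element-per-thread transformSimple kernel.
        detail::transform_caller(src, dst, MultiplyScalar(scale), WithOutMask(), stream);
    }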
@ -43,144 +43,146 @@
#ifndef __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
#define __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__

#include "internal_shared.hpp"
#include "../vec_traits.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

namespace detail
{
    template <bool, typename T1, typename T2> struct Select { typedef T1 type; };
    template <typename T1, typename T2> struct Select<false, T1, T2> { typedef T2 type; };

    template <typename T> struct IsSignedIntergral { enum {value = 0}; };
    template <> struct IsSignedIntergral<schar> { enum {value = 1}; };
    template <> struct IsSignedIntergral<char1> { enum {value = 1}; };
    template <> struct IsSignedIntergral<short> { enum {value = 1}; };
    template <> struct IsSignedIntergral<short1> { enum {value = 1}; };
    template <> struct IsSignedIntergral<int> { enum {value = 1}; };
    template <> struct IsSignedIntergral<int1> { enum {value = 1}; };

    template <typename T> struct IsUnsignedIntegral { enum {value = 0}; };
    template <> struct IsUnsignedIntegral<uchar> { enum {value = 1}; };
    template <> struct IsUnsignedIntegral<uchar1> { enum {value = 1}; };
    template <> struct IsUnsignedIntegral<ushort> { enum {value = 1}; };
    template <> struct IsUnsignedIntegral<ushort1> { enum {value = 1}; };
    template <> struct IsUnsignedIntegral<uint> { enum {value = 1}; };
    template <> struct IsUnsignedIntegral<uint1> { enum {value = 1}; };

    template <typename T> struct IsIntegral { enum {value = IsSignedIntergral<T>::value || IsUnsignedIntegral<T>::value}; };
    template <> struct IsIntegral<char> { enum {value = 1}; };
    template <> struct IsIntegral<bool> { enum {value = 1}; };

    template <typename T> struct IsFloat { enum {value = 0}; };
    template <> struct IsFloat<float> { enum {value = 1}; };
    template <> struct IsFloat<double> { enum {value = 1}; };

    template <typename T> struct IsVec { enum {value = 0}; };
    template <> struct IsVec<uchar1> { enum {value = 1}; };
    template <> struct IsVec<uchar2> { enum {value = 1}; };
    template <> struct IsVec<uchar3> { enum {value = 1}; };
    template <> struct IsVec<uchar4> { enum {value = 1}; };
    template <> struct IsVec<uchar8> { enum {value = 1}; };
    template <> struct IsVec<char1> { enum {value = 1}; };
    template <> struct IsVec<char2> { enum {value = 1}; };
    template <> struct IsVec<char3> { enum {value = 1}; };
    template <> struct IsVec<char4> { enum {value = 1}; };
    template <> struct IsVec<char8> { enum {value = 1}; };
    template <> struct IsVec<ushort1> { enum {value = 1}; };
    template <> struct IsVec<ushort2> { enum {value = 1}; };
    template <> struct IsVec<ushort3> { enum {value = 1}; };
    template <> struct IsVec<ushort4> { enum {value = 1}; };
    template <> struct IsVec<ushort8> { enum {value = 1}; };
    template <> struct IsVec<short1> { enum {value = 1}; };
    template <> struct IsVec<short2> { enum {value = 1}; };
    template <> struct IsVec<short3> { enum {value = 1}; };
    template <> struct IsVec<short4> { enum {value = 1}; };
    template <> struct IsVec<short8> { enum {value = 1}; };
    template <> struct IsVec<uint1> { enum {value = 1}; };
    template <> struct IsVec<uint2> { enum {value = 1}; };
    template <> struct IsVec<uint3> { enum {value = 1}; };
    template <> struct IsVec<uint4> { enum {value = 1}; };
    template <> struct IsVec<uint8> { enum {value = 1}; };
    template <> struct IsVec<int1> { enum {value = 1}; };
    template <> struct IsVec<int2> { enum {value = 1}; };
    template <> struct IsVec<int3> { enum {value = 1}; };
    template <> struct IsVec<int4> { enum {value = 1}; };
    template <> struct IsVec<int8> { enum {value = 1}; };
    template <> struct IsVec<float1> { enum {value = 1}; };
    template <> struct IsVec<float2> { enum {value = 1}; };
    template <> struct IsVec<float3> { enum {value = 1}; };
    template <> struct IsVec<float4> { enum {value = 1}; };
    template <> struct IsVec<float8> { enum {value = 1}; };
    template <> struct IsVec<double1> { enum {value = 1}; };
    template <> struct IsVec<double2> { enum {value = 1}; };
    template <> struct IsVec<double3> { enum {value = 1}; };
    template <> struct IsVec<double4> { enum {value = 1}; };
    template <> struct IsVec<double8> { enum {value = 1}; };

    template <class U> struct AddParameterType { typedef const U& type; };
    template <class U> struct AddParameterType<U&> { typedef U& type; };
    template <> struct AddParameterType<void> { typedef void type; };

    template <class U> struct ReferenceTraits
    {
        enum { value = false };
        typedef U type;
    };
    template <class U> struct ReferenceTraits<U&>
    {
        enum { value = true };
        typedef U type;
    };

    template <class U> struct PointerTraits
    {
        enum { value = false };
        typedef void type;
    };
    template <class U> struct PointerTraits<U*>
    {
        enum { value = true };
        typedef U type;
    };
    template <class U> struct PointerTraits<U*&>
    {
        enum { value = true };
        typedef U type;
    };

    template <class U> struct UnConst
    {
        typedef U type;
        enum { value = 0 };
    };
    template <class U> struct UnConst<const U>
    {
        typedef U type;
        enum { value = 1 };
    };
    template <class U> struct UnConst<const U&>
    {
        typedef U& type;
        enum { value = 1 };
    };

    template <class U> struct UnVolatile
    {
        typedef U type;
        enum { value = 0 };
    };
    template <class U> struct UnVolatile<volatile U>
    {
        typedef U type;
        enum { value = 1 };
    };
    template <class U> struct UnVolatile<volatile U&>
    {
        typedef U& type;
        enum { value = 1 };
    };
}

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
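Editor's sketch: a small compile-time exercise of the detail traits above. The static_assert form assumes a C++11 host compiler (the library itself predates it); none of these lines are part of the commit.

    // Sketch: the traits are plain compile-time predicates and type selectors.
    static_assert(detail::IsSignedIntergral<schar>::value == 1, "schar is a signed integral");
    static_assert(detail::IsVec<float4>::value == 1, "float4 is a CUDA vector type");
    static_assert(detail::UnConst<const int>::value == 1, "const is detected and stripped");

    // Select picks one of two types at compile time, e.g. a wider accumulator
    // for integral inputs and float otherwise:
    typedef detail::Select<detail::IsIntegral<uchar>::value, int, float>::type acc_type; // -> int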
File diff suppressed because it is too large
@ -43,75 +43,77 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__

#include "internal_shared.hpp"
#include "../datamov_utils.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

namespace detail
{
    template <int THREAD_DIM, int N> struct UnrollVecDiffCached
    {
        template <typename Dist, typename T1, typename T2>
        static __device__ void calcCheck(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int ind)
        {
            if (ind < len)
            {
                T1 val1 = *vecCached++;

                T2 val2;
                ForceGlob<T2>::Load(vecGlob, ind, val2);

                dist.reduceIter(val1, val2);

                UnrollVecDiffCached<THREAD_DIM, N - 1>::calcCheck(vecCached, vecGlob, len, dist, ind + THREAD_DIM);
            }
        }

        template <typename Dist, typename T1, typename T2>
        static __device__ void calcWithoutCheck(const T1* vecCached, const T2* vecGlob, Dist& dist)
        {
            T1 val1 = *vecCached++;

            T2 val2;
            ForceGlob<T2>::Load(vecGlob, 0, val2);
            vecGlob += THREAD_DIM;

            dist.reduceIter(val1, val2);

            UnrollVecDiffCached<THREAD_DIM, N - 1>::calcWithoutCheck(vecCached, vecGlob, dist);
        }
    };
    template <int THREAD_DIM> struct UnrollVecDiffCached<THREAD_DIM, 0>
    {
        template <typename Dist, typename T1, typename T2>
        static __device__ __forceinline__ void calcCheck(const T1*, const T2*, int, Dist&, int)
        {
        }

        template <typename Dist, typename T1, typename T2>
        static __device__ __forceinline__ void calcWithoutCheck(const T1*, const T2*, Dist&)
        {
        }
    };

    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN> struct VecDiffCachedCalculator;
    template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, false>
    {
        template <typename Dist, typename T1, typename T2>
        static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
        {
            UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcCheck(vecCached, vecGlob, len, dist, tid);
        }
    };
    template <int THREAD_DIM, int MAX_LEN> struct VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, true>
    {
        template <typename Dist, typename T1, typename T2>
        static __device__ __forceinline__ void calc(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, int tid)
        {
            UnrollVecDiffCached<THREAD_DIM, MAX_LEN / THREAD_DIM>::calcWithoutCheck(vecCached, vecGlob + tid, dist);
        }
    };
}

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
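Editor's sketch: the intended call pattern for the calculator above, assuming a Dist accumulator exposing reduceIter(a, b) in the style of the distance functors in vec_distance.hpp. Names and the wrapper function are illustrative, not part of the commit.

    // Sketch: each of THREAD_DIM cooperating threads accumulates its strided
    // share (elements tid, tid + THREAD_DIM, ...) of a descriptor difference.
    // LEN_EQ_MAX_LEN == true compiles the per-element bounds checks away.
    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T>
    __device__ void accumulateVecDiff(const T* queryCached, const T* trainGlob, int len, Dist& dist, int tid)
    {
        detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(queryCached, trainGlob, len, dist, tid);
    }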
@ -43,38 +43,41 @@
#ifndef __OPENCV_GPU_DYNAMIC_SMEM_HPP__
#define __OPENCV_GPU_DYNAMIC_SMEM_HPP__

#include "internal_shared.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

template<class T> struct DynamicSharedMem
{
    __device__ __forceinline__ operator T*()
    {
        extern __shared__ int __smem[];
        return (T*)__smem;
    }

    __device__ __forceinline__ operator const T*() const
    {
        extern __shared__ int __smem[];
        return (T*)__smem;
    }
};

// specialize for double to avoid unaligned memory access compile errors
template<> struct DynamicSharedMem<double>
{
    __device__ __forceinline__ operator double*()
    {
        extern __shared__ double __smem_d[];
        return (double*)__smem_d;
    }

    __device__ __forceinline__ operator const double*() const
    {
        extern __shared__ double __smem_d[];
        return (double*)__smem_d;
    }
};

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_DYNAMIC_SMEM_HPP__
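Editor's sketch: the conversion operators above alias the single extern shared-memory buffer, whose size is supplied as the third launch parameter. The kernel below is illustrative only.

    // Sketch: DynamicSharedMem inside a kernel; one buffer, typed on demand.
    template <typename T>
    __global__ void reverseViaShared(const T* src, T* dst, int n)
    {
        DynamicSharedMem<T> shared;
        T* smem = shared;                       // aliases the extern __shared__ buffer

        if (threadIdx.x < n)
            smem[threadIdx.x] = src[threadIdx.x];
        __syncthreads();

        if (threadIdx.x < n)
            dst[threadIdx.x] = smem[n - 1 - threadIdx.x];   // reversed copy through shared memory
    }
    // launch: reverseViaShared<float><<<1, 256, 256 * sizeof(float)>>>(src, dst, n);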
@ -43,27 +43,26 @@
#ifndef OPENCV_GPU_EMULATION_HPP_
#define OPENCV_GPU_EMULATION_HPP_

#include "internal_shared.hpp"
#include "warp_reduce.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

struct Emulation
{
    static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
    {
#if __CUDA_ARCH__ >= 200
        (void)cta_buffer;
        return __ballot(predicate);
#else
        int tid = threadIdx.x;
        cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
        return warp_reduce(cta_buffer);
#endif
    }
};

END_OPENCV_DEVICE_NAMESPACE

#endif /* OPENCV_GPU_EMULATION_HPP_ */
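Editor's sketch: Ballot in use. On sm_20+ it compiles to the hardware __ballot vote; on older devices the shared cta_buffer plus warp_reduce path emulates it. The kernel and buffer size below are illustrative assumptions.

    // Sketch: count the threads of each warp whose element is positive;
    // __popc counts the set bits of the returned warp mask.
    __global__ void countPositivePerWarp(const float* data, int* warpCounts)
    {
        __shared__ volatile int cta_buffer[256];   // assumption: blockDim.x <= 256

        const int gid = blockIdx.x * blockDim.x + threadIdx.x;
        const int vote = Emulation::Ballot(data[gid] > 0.f, cta_buffer);
        if ((threadIdx.x & 31) == 0)
            warpCounts[gid >> 5] = __popc(vote);
    }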
@ -43,93 +43,95 @@
#ifndef __OPENCV_GPU_FILTERS_HPP__
#define __OPENCV_GPU_FILTERS_HPP__

#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
#include "vec_math.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

template <typename Ptr2D> struct PointFilter
{
    typedef typename Ptr2D::elem_type elem_type;
    typedef float index_type;

    explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_) : src(src_) {}

    __device__ __forceinline__ elem_type operator ()(float y, float x) const
    {
        return src(__float2int_rn(y), __float2int_rn(x));
    }

    const Ptr2D src;
};

template <typename Ptr2D> struct LinearFilter
{
    typedef typename Ptr2D::elem_type elem_type;
    typedef float index_type;

    explicit __host__ __device__ __forceinline__ LinearFilter(const Ptr2D& src_) : src(src_) {}

    __device__ __forceinline__ elem_type operator ()(float y, float x) const
    {
        typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;

        work_type out = VecTraits<work_type>::all(0);

        const int x1 = __float2int_rd(x);
        const int y1 = __float2int_rd(y);
        const int x2 = x1 + 1;
        const int y2 = y1 + 1;

        elem_type src_reg = src(y1, x1);
        out = out + src_reg * ((x2 - x) * (y2 - y));

        src_reg = src(y1, x2);
        out = out + src_reg * ((x - x1) * (y2 - y));

        src_reg = src(y2, x1);
        out = out + src_reg * ((x2 - x) * (y - y1));

        src_reg = src(y2, x2);
        out = out + src_reg * ((x - x1) * (y - y1));

        return saturate_cast<elem_type>(out);
    }

    const Ptr2D src;
};

template <typename Ptr2D> struct CubicFilter
{
    typedef typename Ptr2D::elem_type elem_type;
    typedef float index_type;
    typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;

    explicit __host__ __device__ __forceinline__ CubicFilter(const Ptr2D& src_) : src(src_) {}

    static __device__ __forceinline__ work_type cubicInterpolate(const work_type& p0, const work_type& p1, const work_type& p2, const work_type& p3, float x)
    {
        return p1 + 0.5f * x * (p2 - p0 + x * (2.0f * p0 - 5.0f * p1 + 4.0f * p2 - p3 + x * (3.0f * (p1 - p2) + p3 - p0)));
    }

    __device__ elem_type operator ()(float y, float x) const
    {
        const int xi = __float2int_rn(x);
        const int yi = __float2int_rn(y);

        work_type arr[4];

        arr[0] = cubicInterpolate(saturate_cast<work_type>(src(yi - 1, xi - 1)), saturate_cast<work_type>(src(yi - 1, xi)), saturate_cast<work_type>(src(yi - 1, xi + 1)), saturate_cast<work_type>(src(yi - 1, xi + 2)), x - xi);
        arr[1] = cubicInterpolate(saturate_cast<work_type>(src(yi    , xi - 1)), saturate_cast<work_type>(src(yi    , xi)), saturate_cast<work_type>(src(yi    , xi + 1)), saturate_cast<work_type>(src(yi    , xi + 2)), x - xi);
        arr[2] = cubicInterpolate(saturate_cast<work_type>(src(yi + 1, xi - 1)), saturate_cast<work_type>(src(yi + 1, xi)), saturate_cast<work_type>(src(yi + 1, xi + 1)), saturate_cast<work_type>(src(yi + 1, xi + 2)), x - xi);
        arr[3] = cubicInterpolate(saturate_cast<work_type>(src(yi + 2, xi - 1)), saturate_cast<work_type>(src(yi + 2, xi)), saturate_cast<work_type>(src(yi + 2, xi + 1)), saturate_cast<work_type>(src(yi + 2, xi + 2)), x - xi);

        return saturate_cast<elem_type>(cubicInterpolate(arr[0], arr[1], arr[2], arr[3], y - yi));
    }

    const Ptr2D src;
};

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_FILTERS_HPP__
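Editor's sketch: a filter in a resize-style kernel. Note that the four bilinear weights in LinearFilter sum to one, since (x2 - x) + (x - x1) == 1 and likewise in y. The kernel assumes a Ptr2D whose elem_type is float and is illustrative only, not part of the commit.

    // Sketch: bilinear sampling through LinearFilter at fractional coordinates.
    template <typename Ptr2D>
    __global__ void resizeLinearExample(const Ptr2D src, PtrStep<float> dst, int dstCols, int dstRows, float fx, float fy)
    {
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x < dstCols && y < dstRows)
        {
            const LinearFilter<Ptr2D> filter(src);
            dst.ptr(y)[x] = filter(y * fy, x * fx);   // sample at a fractional source coordinate
        }
    }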
@ -44,35 +44,31 @@
#ifndef __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_
#define __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_

#include <cstdio>
#include "internal_shared.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

template<class Func>
void printFuncAttrib(Func& func)
{
    cudaFuncAttributes attrs;
    cudaFuncGetAttributes(&attrs, func);

    printf("=== Function stats ===\n");
    printf("Name: \n");
    printf("sharedSizeBytes    = %d\n", attrs.sharedSizeBytes);
    printf("constSizeBytes     = %d\n", attrs.constSizeBytes);
    printf("localSizeBytes     = %d\n", attrs.localSizeBytes);
    printf("maxThreadsPerBlock = %d\n", attrs.maxThreadsPerBlock);
    printf("numRegs            = %d\n", attrs.numRegs);
    printf("ptxVersion         = %d\n", attrs.ptxVersion);
    printf("binaryVersion      = %d\n", attrs.binaryVersion);
    printf("\n");
    fflush(stdout);
}

END_OPENCV_DEVICE_NAMESPACE

#endif /* __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_ */
@ -49,182 +49,182 @@
|
||||
#include "vec_traits.hpp"
|
||||
#include "type_traits.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
// Function Objects
|
||||
|
||||
using thrust::unary_function;
|
||||
using thrust::binary_function;
|
||||
|
||||
// Arithmetic Operations
|
||||
|
||||
template <typename T> struct plus : binary_function<T, T, T>
|
||||
{
|
||||
// Function Objects
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a + b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct minus : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a - b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct multiplies : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a * b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct divides : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a / b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct modulus : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a % b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct negate : unary_function<T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a) const
|
||||
{
|
||||
return -a;
|
||||
}
|
||||
};
|
||||
|
||||
using thrust::unary_function;
|
||||
using thrust::binary_function;
|
||||
// Comparison Operations
|
||||
|
||||
// Arithmetic Operations
|
||||
template <typename T> struct equal_to : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a == b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct not_equal_to : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a != b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct greater : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a > b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct less : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a < b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct greater_equal : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a >= b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct less_equal : binary_function<T, T, bool>
|
||||
{
|
||||
__device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a <= b;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T> struct plus : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a + b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct minus : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a - b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct multiplies : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a * b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct divides : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a / b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct modulus : binary_function<T, T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
|
||||
{
|
||||
return a % b;
|
||||
}
|
||||
};
|
||||
template <typename T> struct negate : unary_function<T, T>
|
||||
{
|
||||
__device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a) const
|
||||
{
|
||||
return -a;
|
||||
}
|
||||
};
|
||||
// Comparison Operations

template <typename T> struct equal_to : binary_function<T, T, bool>
{
    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
    {
        return a == b;
    }
};
template <typename T> struct not_equal_to : binary_function<T, T, bool>
{
    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
    {
        return a != b;
    }
};
template <typename T> struct greater : binary_function<T, T, bool>
{
    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
    {
        return a > b;
    }
};
template <typename T> struct less : binary_function<T, T, bool>
{
    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
    {
        return a < b;
    }
};
template <typename T> struct greater_equal : binary_function<T, T, bool>
{
    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
    {
        return a >= b;
    }
};
template <typename T> struct less_equal : binary_function<T, T, bool>
{
    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
    {
        return a <= b;
    }
};

// Logical Operations

template <typename T> struct logical_and : binary_function<T, T, bool>
{
    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
    {
        return a && b;
    }
};
template <typename T> struct logical_or : binary_function<T, T, bool>
{
    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
    {
        return a || b;
    }
};
template <typename T> struct logical_not : unary_function<T, bool>
{
    __device__ __forceinline__ bool operator ()(typename TypeTraits<T>::ParameterType a) const
    {
        return !a;
    }
};

// Bitwise Operations

template <typename T> struct bit_and : binary_function<T, T, T>
{
    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
    {
        return a & b;
    }
};
template <typename T> struct bit_or : binary_function<T, T, T>
{
    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
    {
        return a | b;
    }
};
template <typename T> struct bit_xor : binary_function<T, T, T>
{
    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType a, typename TypeTraits<T>::ParameterType b) const
    {
        return a ^ b;
    }
};
template <typename T> struct bit_not : unary_function<T, T>
{
    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const
    {
        return ~v;
    }
};
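A user-defined functor can follow the same conventions; a minimal sketch (hypothetical, not from the diff), deriving from binary_function so the functor traits further below can introspect its argument and result types:

// Hypothetical example: a clamped difference usable anywhere these functors are.
struct clamped_diff : binary_function<int, int, int>
{
    __device__ __forceinline__ int operator ()(int a, int b) const
    {
        return a > b ? a - b : 0;   // difference clamped at zero
    }
};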
// Generalized Identity Operations

template <typename T> struct identity : unary_function<T, T>
{
    __device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const
    {
        return x;
    }
};

template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>
{
    __device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
    {
        return lhs;
    }
};
template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>
{
    __device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
    {
        return rhs;
    }
};
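A quick behavioral note (illustration, not in the diff): identity passes its argument through, while the projections select one of two arguments.

__device__ void projection_examples()   // hypothetical helper, for illustration
{
    int   a = identity<int>()(7);                  // 7
    int   b = project1st<int, float>()(3, 2.5f);   // 3  (first argument)
    float c = project2nd<int, float>()(3, 2.5f);   // 2.5f (second argument)
    (void)a; (void)b; (void)c;
}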
// Min/Max Operations

@@ -234,39 +234,39 @@ namespace cv { namespace gpu { namespace device
        __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
    };

template <typename T> struct maximum : binary_function<T, T, T>
{
    __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
    {
        return lhs < rhs ? rhs : lhs;
    }
};
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, schar, ::max)
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, char, ::max)
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, ushort, ::max)
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, short, ::max)
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, int, ::max)
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uint, ::max)
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, float, ::fmax)
OPENCV_GPU_IMPLEMENT_MINMAX(maximum, double, ::fmax)

template <typename T> struct minimum : binary_function<T, T, T>
{
    __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
    {
        return lhs < rhs ? lhs : rhs;
    }
};
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, schar, ::min)
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, char, ::min)
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, ushort, ::min)
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, short, ::min)
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, int, ::min)
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uint, ::min)
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, float, ::fmin)
OPENCV_GPU_IMPLEMENT_MINMAX(minimum, double, ::fmin)

#undef OPENCV_GPU_IMPLEMENT_MINMAX

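Judging from the macro tail shown in the hunk above, each OPENCV_GPU_IMPLEMENT_MINMAX invocation presumably expands to a full specialization that forwards to the named intrinsic rather than the generic ?: form, roughly:

// Reconstructed expansion (assumption, not shown in full by the diff):
// OPENCV_GPU_IMPLEMENT_MINMAX(maximum, float, ::fmax) ->
//
// template <> struct maximum<float> : binary_function<float, float, float>
// {
//     __device__ __forceinline__ float operator()(float lhs, float rhs) const { return ::fmax(lhs, rhs); }
// };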
@@ -277,14 +277,14 @@ namespace cv { namespace gpu { namespace device
    { \
        __device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v) const \
        { \
            return ::func ## f(v); \
        } \
    }; \
    template <> struct func ## _func<double> : unary_function<double, double> \
    { \
        __device__ __forceinline__ double operator ()(double v) const \
        { \
            return ::func(v); \
        } \
    };
#define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(func) \
@@ -292,266 +292,270 @@ namespace cv { namespace gpu { namespace device
    { \
        __device__ __forceinline__ float operator ()(typename TypeTraits<T>::ParameterType v1, typename TypeTraits<T>::ParameterType v2) const \
        { \
            return ::func ## f(v1, v2); \
        } \
    }; \
    template <> struct func ## _func<double> : binary_function<double, double, double> \
    { \
        __device__ __forceinline__ double operator ()(double v1, double v2) const \
        { \
            return ::func(v1, v2); \
        } \
    };

OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp10)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log2)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(log10)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sin)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cos)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tan)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asin)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acos)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atan)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sinh)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(cosh)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(tanh)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(asinh)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(acosh)
OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(atanh)

OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(hypot)
OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(atan2)
OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(pow)

#undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR
#undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR

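After expansion, OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt) defines sqrt_func: the generic template computes in float via ::sqrtf, and the double specialization calls ::sqrt. A small sketch (hypothetical function names):

__device__ float  sqrt_float_example(float v)   { return sqrt_func<float>()(v);  }   // calls ::sqrtf(v)
__device__ double sqrt_double_example(double v) { return sqrt_func<double>()(v); }   // calls ::sqrt(v)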
template<typename T> struct hypot_sqr_func : binary_function<T, T, float>
{
    __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const
    {
        return src1 * src1 + src2 * src2;
    }
};

// Saturate Cast Functor

template <typename T, typename D> struct saturate_cast_func : unary_function<T, D>
{
    __device__ __forceinline__ D operator ()(typename TypeTraits<T>::ParameterType v) const
    {
        return saturate_cast<D>(v);
    }
};

// Threshold Functors

template <typename T> struct thresh_binary_func : unary_function<T, T>
{
    __host__ __device__ __forceinline__ thresh_binary_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}

    __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
    {
        return (src > thresh) * maxVal;
    }

    const T thresh;
    const T maxVal;
};
template <typename T> struct thresh_binary_inv_func : unary_function<T, T>
{
    __host__ __device__ __forceinline__ thresh_binary_inv_func(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}

    __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
    {
        return (src <= thresh) * maxVal;
    }

    const T thresh;
    const T maxVal;
};
template <typename T> struct thresh_trunc_func : unary_function<T, T>
{
    explicit __host__ __device__ __forceinline__ thresh_trunc_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}

    __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
    {
        return minimum<T>()(src, thresh);
    }

    const T thresh;
};
template <typename T> struct thresh_to_zero_func : unary_function<T, T>
{
    explicit __host__ __device__ __forceinline__ thresh_to_zero_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}

    __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
    {
        return (src > thresh) * src;
    }

    const T thresh;
};
template <typename T> struct thresh_to_zero_inv_func : unary_function<T, T>
{
    explicit __host__ __device__ __forceinline__ thresh_to_zero_inv_func(T thresh_, T maxVal_ = 0) : thresh(thresh_) {}

    __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType src) const
    {
        return (src <= thresh) * src;
    }

    const T thresh;
};
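Behavior sketch (not part of the commit): with thresh = 100 and maxVal = 255, thresh_binary_func reproduces the THRESH_BINARY rule of cv::threshold, using (src > thresh) * maxVal instead of a branch.

__device__ uchar threshold_example(uchar src)   // hypothetical helper
{
    thresh_binary_func<uchar> op(100, 255);
    return op(src);   // 255 if src > 100, otherwise 0
}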

// Function Object Adaptors

template <typename Predicate> struct unary_negate : unary_function<typename Predicate::argument_type, bool>
{
    explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}

    __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const
    {
        return !pred(x);
    }

    const Predicate pred;
};
template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)
{
    return unary_negate<Predicate>(pred);
}

template <typename Predicate> struct binary_negate : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>
{
    explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}

    __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
    {
        return !pred(x,y);
    }

    const Predicate pred;
};
template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)
{
    return binary_negate<BinaryPredicate>(pred);
}

template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
{
    __host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}

    __device__ __forceinline__ typename Op::result_type operator ()(typename TypeTraits<typename Op::second_argument_type>::ParameterType a) const
    {
        return op(arg1, a);
    }

    const Op op;
    const typename Op::first_argument_type arg1;
};
template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
{
    return binder1st<Op>(op, typename Op::first_argument_type(x));
}

template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
{
    __host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}

    __forceinline__ __device__ typename Op::result_type operator ()(typename TypeTraits<typename Op::first_argument_type>::ParameterType a) const
    {
        return op(a, arg2);
    }

    const Op op;
    const typename Op::second_argument_type arg2;
};
template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
{
    return binder2nd<Op>(op, typename Op::second_argument_type(x));
}

// Functor Traits

template <typename F> struct IsUnaryFunction
{
    typedef char Yes;
    struct No {Yes a[2];};

    template <typename T, typename D> static Yes check(unary_function<T, D>);
    static No check(...);

    static F makeF();

    enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
};

template <typename F> struct IsBinaryFunction
{
    typedef char Yes;
    struct No {Yes a[2];};

    template <typename T1, typename T2, typename D> static Yes check(binary_function<T1, T2, D>);
    static No check(...);

    static F makeF();

    enum { value = (sizeof(check(makeF())) == sizeof(Yes)) };
};

namespace detail
{
    template <size_t src_elem_size, size_t dst_elem_size> struct UnOpShift { enum { shift = 1 }; };
    template <size_t src_elem_size> struct UnOpShift<src_elem_size, 1> { enum { shift = 4 }; };
    template <size_t src_elem_size> struct UnOpShift<src_elem_size, 2> { enum { shift = 2 }; };

    template <typename T, typename D> struct DefaultUnaryShift
    {
        enum { shift = detail::UnOpShift<sizeof(T), sizeof(D)>::shift };
    };

    template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };
    template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };
    template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };

    template <typename T1, typename T2, typename D> struct DefaultBinaryShift
    {
        enum { shift = detail::BinOpShift<sizeof(T1), sizeof(T2), sizeof(D)>::shift };
    };

    template <typename Func, bool unary = IsUnaryFunction<Func>::value> struct ShiftDispatcher;
    template <typename Func> struct ShiftDispatcher<Func, true>
    {
        enum { shift = DefaultUnaryShift<typename Func::argument_type, typename Func::result_type>::shift };
    };
    template <typename Func> struct ShiftDispatcher<Func, false>
    {
        enum { shift = DefaultBinaryShift<typename Func::first_argument_type, typename Func::second_argument_type, typename Func::result_type>::shift };
    };
}

template <typename Func> struct DefaultTransformShift
{
    enum { shift = detail::ShiftDispatcher<Func>::shift };
};

template <typename Func> struct DefaultTransformFunctorTraits
{
    enum { simple_block_dim_x = 16 };
    enum { simple_block_dim_y = 16 };

    enum { smart_block_dim_x = 16 };
    enum { smart_block_dim_y = 16 };
    enum { smart_shift = DefaultTransformShift<Func>::shift };
};

template <typename Func> struct TransformFunctorTraits : DefaultTransformFunctorTraits<Func> {};

#define DEFINE_TRANSFORM_FUNCTOR_TRAITS(type) \
    template <> struct TransformFunctorTraits< type > : DefaultTransformFunctorTraits< type >
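A usage sketch of the macro above (hypothetical values, not from the diff): kernel authors can widen the per-thread workload for a specific functor by specializing its traits.

DEFINE_TRANSFORM_FUNCTOR_TRAITS(bit_not<uint>)
{
    enum { smart_shift = 8 };   // assumption: process 8 elements per thread instead of the default
};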

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_FUNCTIONAL_HPP__

@@ -43,190 +43,193 @@
#ifndef __OPENCV_GPU_LIMITS_GPU_HPP__
#define __OPENCV_GPU_LIMITS_GPU_HPP__

#include "internal_shared.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

template<class T> struct numeric_limits
{
    typedef T type;
    __device__ __forceinline__ static type min() { return type(); };
    __device__ __forceinline__ static type max() { return type(); };
    __device__ __forceinline__ static type epsilon() { return type(); }
    __device__ __forceinline__ static type round_error() { return type(); }
    __device__ __forceinline__ static type denorm_min() { return type(); }
    __device__ __forceinline__ static type infinity() { return type(); }
    __device__ __forceinline__ static type quiet_NaN() { return type(); }
    __device__ __forceinline__ static type signaling_NaN() { return T(); }
    static const bool is_signed;
};

template<> struct numeric_limits<bool>
{
    typedef bool type;
    __device__ __forceinline__ static type min() { return false; };
    __device__ __forceinline__ static type max() { return true; };
    __device__ __forceinline__ static type epsilon();
    __device__ __forceinline__ static type round_error();
    __device__ __forceinline__ static type denorm_min();
    __device__ __forceinline__ static type infinity();
    __device__ __forceinline__ static type quiet_NaN();
    __device__ __forceinline__ static type signaling_NaN();
    static const bool is_signed = false;
};

template<> struct numeric_limits<char>
{
    typedef char type;
    __device__ __forceinline__ static type min() { return CHAR_MIN; };
    __device__ __forceinline__ static type max() { return CHAR_MAX; };
    __device__ __forceinline__ static type epsilon();
    __device__ __forceinline__ static type round_error();
    __device__ __forceinline__ static type denorm_min();
    __device__ __forceinline__ static type infinity();
    __device__ __forceinline__ static type quiet_NaN();
    __device__ __forceinline__ static type signaling_NaN();
    static const bool is_signed = (char)-1 == -1;
};

template<> struct numeric_limits<signed char>
{
    typedef char type;
    __device__ __forceinline__ static type min() { return CHAR_MIN; };
    __device__ __forceinline__ static type max() { return CHAR_MAX; };
    __device__ __forceinline__ static type epsilon();
    __device__ __forceinline__ static type round_error();
    __device__ __forceinline__ static type denorm_min();
    __device__ __forceinline__ static type infinity();
    __device__ __forceinline__ static type quiet_NaN();
    __device__ __forceinline__ static type signaling_NaN();
    static const bool is_signed = (signed char)-1 == -1;
};

template<> struct numeric_limits<unsigned char>
{
    typedef unsigned char type;
    __device__ __forceinline__ static type min() { return 0; };
    __device__ __forceinline__ static type max() { return UCHAR_MAX; };
    __device__ __forceinline__ static type epsilon();
    __device__ __forceinline__ static type round_error();
    __device__ __forceinline__ static type denorm_min();
    __device__ __forceinline__ static type infinity();
    __device__ __forceinline__ static type quiet_NaN();
    __device__ __forceinline__ static type signaling_NaN();
    static const bool is_signed = false;
};

template<> struct numeric_limits<short>
{
    typedef short type;
    __device__ __forceinline__ static type min() { return SHRT_MIN; };
    __device__ __forceinline__ static type max() { return SHRT_MAX; };
    __device__ __forceinline__ static type epsilon();
    __device__ __forceinline__ static type round_error();
    __device__ __forceinline__ static type denorm_min();
    __device__ __forceinline__ static type infinity();
    __device__ __forceinline__ static type quiet_NaN();
    __device__ __forceinline__ static type signaling_NaN();
    static const bool is_signed = true;
};

template<> struct numeric_limits<unsigned short>
{
    typedef unsigned short type;
    __device__ __forceinline__ static type min() { return 0; };
    __device__ __forceinline__ static type max() { return USHRT_MAX; };
    __device__ __forceinline__ static type epsilon();
    __device__ __forceinline__ static type round_error();
    __device__ __forceinline__ static type denorm_min();
    __device__ __forceinline__ static type infinity();
    __device__ __forceinline__ static type quiet_NaN();
    __device__ __forceinline__ static type signaling_NaN();
    static const bool is_signed = false;
};

template<> struct numeric_limits<int>
{
    typedef int type;
    __device__ __forceinline__ static type min() { return INT_MIN; };
    __device__ __forceinline__ static type max() { return INT_MAX; };
    __device__ __forceinline__ static type epsilon();
    __device__ __forceinline__ static type round_error();
    __device__ __forceinline__ static type denorm_min();
    __device__ __forceinline__ static type infinity();
    __device__ __forceinline__ static type quiet_NaN();
    __device__ __forceinline__ static type signaling_NaN();
    static const bool is_signed = true;
};

template<> struct numeric_limits<unsigned int>
{
    typedef unsigned int type;
    __device__ __forceinline__ static type min() { return 0; };
    __device__ __forceinline__ static type max() { return UINT_MAX; };
    __device__ __forceinline__ static type epsilon();
    __device__ __forceinline__ static type round_error();
    __device__ __forceinline__ static type denorm_min();
    __device__ __forceinline__ static type infinity();
    __device__ __forceinline__ static type quiet_NaN();
    __device__ __forceinline__ static type signaling_NaN();
    static const bool is_signed = false;
};

template<> struct numeric_limits<long>
{
    typedef long type;
    __device__ __forceinline__ static type min() { return LONG_MIN; };
    __device__ __forceinline__ static type max() { return LONG_MAX; };
    __device__ __forceinline__ static type epsilon();
    __device__ __forceinline__ static type round_error();
    __device__ __forceinline__ static type denorm_min();
    __device__ __forceinline__ static type infinity();
    __device__ __forceinline__ static type quiet_NaN();
    __device__ __forceinline__ static type signaling_NaN();
    static const bool is_signed = true;
};

template<> struct numeric_limits<unsigned long>
{
    typedef unsigned long type;
    __device__ __forceinline__ static type min() { return 0; };
    __device__ __forceinline__ static type max() { return ULONG_MAX; };
    __device__ __forceinline__ static type epsilon();
    __device__ __forceinline__ static type round_error();
    __device__ __forceinline__ static type denorm_min();
    __device__ __forceinline__ static type infinity();
    __device__ __forceinline__ static type quiet_NaN();
    __device__ __forceinline__ static type signaling_NaN();
    static const bool is_signed = false;
};

template<> struct numeric_limits<float>
{
    typedef float type;
    __device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };
    __device__ __forceinline__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; };
    __device__ __forceinline__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; };
    __device__ __forceinline__ static type round_error();
    __device__ __forceinline__ static type denorm_min();
    __device__ __forceinline__ static type infinity();
    __device__ __forceinline__ static type quiet_NaN();
    __device__ __forceinline__ static type signaling_NaN();
    static const bool is_signed = true;
};

template<> struct numeric_limits<double>
{
    typedef double type;
    __device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; };
    __device__ __forceinline__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; };
    __device__ __forceinline__ static type epsilon();
    __device__ __forceinline__ static type round_error();
    __device__ __forceinline__ static type denorm_min();
    __device__ __forceinline__ static type infinity();
    __device__ __forceinline__ static type quiet_NaN();
    __device__ __forceinline__ static type signaling_NaN();
    static const bool is_signed = true;
};

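Usage sketch (illustration, not in the diff): device code can query these traits where std::numeric_limits is unavailable, e.g. to seed a reduction.

template <typename T>
__device__ __forceinline__ T min_reduce_identity()   // hypothetical helper
{
    return numeric_limits<T>::max();   // every real element compares <= this seed
}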
END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_LIMITS_GPU_HPP__

@@ -45,122 +45,173 @@

#include "internal_shared.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }
template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }

template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
{
    return (uchar) ::max((int)v, 0);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
{
    return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
{
    return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
{
    return (uchar) ::min(v, (uint)UCHAR_MAX);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
{
    return saturate_cast<uchar>((uint)v);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
{
    int iv = __float2int_rn(v);
    return saturate_cast<uchar>(iv);
}
template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
{
#if __CUDA_ARCH__ >= 130
    int iv = __double2int_rn(v);
    return saturate_cast<uchar>(iv);
#else
    return saturate_cast<uchar>((float)v);
#endif
}

template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
{
    return (schar) ::min((int)v, SCHAR_MAX);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
{
    return (schar) ::min((uint)v, (uint)SCHAR_MAX);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
{
    return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
{
    return saturate_cast<schar>((int)v);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
{
    return (schar) ::min(v, (uint)SCHAR_MAX);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
{
    int iv = __float2int_rn(v);
    return saturate_cast<schar>(iv);
}
template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
{
#if __CUDA_ARCH__ >= 130
    int iv = __double2int_rn(v);
    return saturate_cast<schar>(iv);
#else
    return saturate_cast<schar>((float)v);
#endif
}

template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
{
    return (ushort) ::max((int)v, 0);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
{
    return (ushort) ::max((int)v, 0);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
{
    return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
{
    return (ushort) ::min(v, (uint)USHRT_MAX);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
{
    int iv = __float2int_rn(v);
    return saturate_cast<ushort>(iv);
}
template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
{
#if __CUDA_ARCH__ >= 130
    int iv = __double2int_rn(v);
    return saturate_cast<ushort>(iv);
#else
    return saturate_cast<ushort>((float)v);
#endif
}

template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
{
    return (short) ::min((int)v, SHRT_MAX);
}
template<> __device__ __forceinline__ short saturate_cast<short>(int v)
{
    return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
}
template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
{
    return (short) ::min(v, (uint)SHRT_MAX);
}
template<> __device__ __forceinline__ short saturate_cast<short>(float v)
{
    int iv = __float2int_rn(v);
    return saturate_cast<short>(iv);
}
template<> __device__ __forceinline__ short saturate_cast<short>(double v)
{
#if __CUDA_ARCH__ >= 130
    int iv = __double2int_rn(v);
    return saturate_cast<short>(iv);
#else
    return saturate_cast<short>((float)v);
#endif
}

template<> __device__ __forceinline__ int saturate_cast<int>(float v)
{
    return __float2int_rn(v);
}
template<> __device__ __forceinline__ int saturate_cast<int>(double v)
{
#if __CUDA_ARCH__ >= 130
    return __double2int_rn(v);
#else
    return saturate_cast<int>((float)v);
#endif
}

template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
{
    return __float2uint_rn(v);
}
template<> __device__ __forceinline__ uint saturate_cast<uint>(double v)
{
#if __CUDA_ARCH__ >= 130
    return __double2uint_rn(v);
#else
    return saturate_cast<uint>((float)v);
#endif
}
}}}
|
||||
END_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
#endif /* __OPENCV_GPU_SATURATE_CAST_HPP__ */
|
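
A minimal usage sketch for these casts (not part of the diff; the kernel and buffer names are illustrative, and it assumes the BEGIN/END macros expand to cv::gpu::device):

__global__ void convertToU16(const float* src, ushort* dst, int n)
{
    // saturate_cast<ushort>(float) rounds with __float2int_rn, then clamps:
    // 70000.4f -> 65535 (USHRT_MAX), -3.2f -> 0
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = cv::gpu::device::saturate_cast<ushort>(src[i]);
}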
@ -43,32 +43,34 @@

#ifndef __OPENCV_GPU_TRANSFORM_HPP__
#define __OPENCV_GPU_TRANSFORM_HPP__

#include "internal_shared.hpp"
#include "utility.hpp"
#include "detail/transform_detail.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

template <typename T, typename D, typename UnOp>
void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const UnOp& op, cudaStream_t stream = 0)
{
    detail::transform_caller(src, dst, op, WithOutMask(), stream);
}
template <typename T, typename D, typename UnOp>
void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, const PtrStepb& mask, const UnOp& op, cudaStream_t stream = 0)
{
    detail::transform_caller(src, dst, op, SingleMask(mask), stream);
}

template <typename T1, typename T2, typename D, typename BinOp>
void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const BinOp& op, cudaStream_t stream = 0)
{
    detail::transform_caller(src1, src2, dst, op, WithOutMask(), stream);
}
template <typename T1, typename T2, typename D, typename BinOp>
void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, const PtrStepb& mask, const BinOp& op, cudaStream_t stream = 0)
{
    detail::transform_caller(src1, src2, dst, op, SingleMask(mask), stream);
}

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_TRANSFORM_HPP__
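A sketch of a caller for the unmasked unary overload (the Doubler functor and function name are illustrative, not from this commit; it assumes the macros expand to cv::gpu::device):

struct Doubler
{
    __device__ __forceinline__ float operator()(float x) const { return 2.0f * x; }
};

void doubleImage(const DevMem2D_<float>& src, const DevMem2D_<float>& dst, cudaStream_t stream)
{
    // detail::transform_caller picks the launch configuration
    cv::gpu::device::transform(src, dst, Doubler(), stream);
}
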
@ -43,38 +43,40 @@

#ifndef __OPENCV_GPU_TYPE_TRAITS_HPP__
#define __OPENCV_GPU_TYPE_TRAITS_HPP__

#include "internal_shared.hpp"
#include "detail/type_traits_detail.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

template <typename T> struct IsSimpleParameter
{
    enum {value = detail::IsIntegral<T>::value || detail::IsFloat<T>::value || detail::PointerTraits<typename detail::ReferenceTraits<T>::type>::value};
};

template <typename T> struct TypeTraits
{
    typedef typename detail::UnConst<T>::type                                    NonConstType;
    typedef typename detail::UnVolatile<T>::type                                 NonVolatileType;
    typedef typename detail::UnVolatile<typename detail::UnConst<T>::type>::type UnqualifiedType;
    typedef typename detail::PointerTraits<UnqualifiedType>::type                PointeeType;
    typedef typename detail::ReferenceTraits<T>::type                            ReferredType;

    enum { isConst       = detail::UnConst<T>::value };
    enum { isVolatile    = detail::UnVolatile<T>::value };

    enum { isReference   = detail::ReferenceTraits<UnqualifiedType>::value };
    enum { isPointer     = detail::PointerTraits<typename detail::ReferenceTraits<UnqualifiedType>::type>::value };

    enum { isUnsignedInt = detail::IsUnsignedIntegral<UnqualifiedType>::value };
    enum { isSignedInt   = detail::IsSignedIntergral<UnqualifiedType>::value };
    enum { isIntegral    = detail::IsIntegral<UnqualifiedType>::value };
    enum { isFloat       = detail::IsFloat<UnqualifiedType>::value };
    enum { isArith       = isIntegral || isFloat };
    enum { isVec         = detail::IsVec<UnqualifiedType>::value };

    typedef typename detail::Select<IsSimpleParameter<UnqualifiedType>::value, T, typename detail::AddParameterType<T>::type>::type ParameterType;
};

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_TYPE_TRAITS_HPP__
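These traits exist for compile-time dispatch; a sketch of the kind of checks they support (StaticAssert is the checker already used in utility.hpp below; the ParameterType comment assumes detail::AddParameterType yields a const reference, which is the conventional implementation):

StaticAssert<TypeTraits<uint>::isUnsignedInt>::check();
StaticAssert<TypeTraits<float>::isArith>::check();
// ParameterType picks by-value for scalars and pointers, and (assumed) const T&
// for everything else, so generic device functions can declare arguments as
// typename TypeTraits<T>::ParameterType without penalizing either case.
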
@ -48,152 +48,168 @@

#include "datamov_utils.hpp"
#include "detail/utility_detail.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

#define OPENCV_GPU_LOG_WARP_SIZE (5)
#define OPENCV_GPU_WARP_SIZE     (1 << OPENCV_GPU_LOG_WARP_SIZE)
#define OPENCV_GPU_LOG_MEM_BANKS ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
#define OPENCV_GPU_MEM_BANKS     (1 << OPENCV_GPU_LOG_MEM_BANKS)

///////////////////////////////////////////////////////////////////////////////
// swap

template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)
{
    const T temp = a;
    a = b;
    b = temp;
}

///////////////////////////////////////////////////////////////////////////////
// Mask Reader

struct SingleMask
{
    explicit __host__ __device__ __forceinline__ SingleMask(const PtrStepb& mask_) : mask(mask_) {}

    __device__ __forceinline__ bool operator()(int y, int x) const
    {
        return mask.ptr(y)[x] != 0;
    }

    const PtrStepb mask;
};

struct MaskCollection
{
    explicit __host__ __device__ __forceinline__ MaskCollection(PtrStepb* maskCollection_) : maskCollection(maskCollection_) {}

    __device__ __forceinline__ void next()
    {
        curMask = *maskCollection++;
    }
    __device__ __forceinline__ void setMask(int z)
    {
        curMask = maskCollection[z];
    }

    __device__ __forceinline__ bool operator()(int y, int x) const
    {
        uchar val;
        return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(y), x, val), (val != 0));
    }

    const PtrStepb* maskCollection;
    PtrStepb curMask;
};

struct WithOutMask
{
    __device__ __forceinline__ void next() const
    {
    }
    __device__ __forceinline__ void setMask(int) const
    {
    }

    __device__ __forceinline__ bool operator()(int, int) const
    {
        return true;
    }

    __device__ __forceinline__ bool operator()(int, int, int) const
    {
        return true;
    }

    static __device__ __forceinline__ bool check(int, int)
    {
        return true;
    }

    static __device__ __forceinline__ bool check(int, int, int)
    {
        return true;
    }
};
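
All three mask readers expose the same call shape, so a kernel can take the policy as a template parameter and be instantiated per mask mode; a sketch (the kernel is illustrative, not from this commit):

template <typename Mask>
__global__ void setMasked(DevMem2D_<uchar> dst, const Mask mask)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // WithOutMask::operator() is a constant true, so the branch folds away
    if (x < dst.cols && y < dst.rows && mask(y, x))
        dst.ptr(y)[x] = 255;
}
// instantiated as setMasked<WithOutMask> or setMasked<SingleMask>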

///////////////////////////////////////////////////////////////////////////////
// Reduction

template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
{
    StaticAssert<n >= 8 && n <= 512>::check();
    detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
}

template <int n, typename T, typename V, typename Pred>
__device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
{
    StaticAssert<n >= 8 && n <= 512>::check();
    detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
}

template <int n, typename T, typename V1, typename V2, typename Pred>
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
{
    StaticAssert<n >= 8 && n <= 512>::check();
    detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
}

///////////////////////////////////////////////////////////////////////////////
// Solve linear system

// solve 2x2 linear system Ax=b
template <typename T> __device__ __forceinline__ bool solve2x2(const T A[2][2], const T b[2], T x[2])
{
    T det = A[0][0] * A[1][1] - A[1][0] * A[0][1];

    if (det != 0)
    {
        double invdet = 1.0 / det;

        x[0] = saturate_cast<T>(invdet * (b[0] * A[1][1] - b[1] * A[0][1]));

        x[1] = saturate_cast<T>(invdet * (A[0][0] * b[1] - A[1][0] * b[0]));

        return true;
    }

    return false;
}

// solve 3x3 linear system Ax=b
template <typename T> __device__ __forceinline__ bool solve3x3(const T A[3][3], const T b[3], T x[3])
{
    T det = A[0][0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1])
          - A[0][1] * (A[1][0] * A[2][2] - A[1][2] * A[2][0])
          + A[0][2] * (A[1][0] * A[2][1] - A[1][1] * A[2][0]);

    if (det != 0)
    {
        double invdet = 1.0 / det;

        x[0] = saturate_cast<T>(invdet *
            (b[0]    * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
             A[0][1] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) +
             A[0][2] * (b[1]    * A[2][1] - A[1][1] * b[2]   )));

        x[1] = saturate_cast<T>(invdet *
            (A[0][0] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) -
             b[0]    * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
             A[0][2] * (A[1][0] * b[2]    - b[1]    * A[2][0])));

        x[2] = saturate_cast<T>(invdet *
            (A[0][0] * (A[1][1] * b[2]    - b[1]    * A[2][1]) -
             A[0][1] * (A[1][0] * b[2]    - b[1]    * A[2][0]) +
             b[0]    * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));

        return true;
    }

    return false;
}

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_UTILITY_HPP__
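A block-sum sketch for reduce(): each thread folds a strided slice into a register, then the shared-memory tree finishes it. Names are illustrative; the launch must use THREADS threads per block (8 <= THREADS <= 512 per the StaticAssert), and it assumes, as the vec_distance functors below do, that thread 0's accumulator holds the final value afterwards:

template <int THREADS>
__global__ void blockSum(const float* src, int n, float* partial)
{
    __shared__ float smem[THREADS];

    const int tid = threadIdx.x;
    float mySum = 0.0f;

    for (int i = blockIdx.x * THREADS + tid; i < n; i += gridDim.x * THREADS)
        mySum += src[i];

    reduce<THREADS>(smem, mySum, tid, plus<volatile float>());   // plus<> comes from functional.hpp

    if (tid == 0)
        partial[blockIdx.x] = mySum;                             // block total
}
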
@ -48,179 +48,179 @@

#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

template <typename T> struct L1Dist
{
    typedef int value_type;
    typedef int result_type;

    __device__ __forceinline__ L1Dist() : mySum(0) {}

    __device__ __forceinline__ void reduceIter(int val1, int val2)
    {
        mySum = __sad(val1, val2, mySum);
    }

    template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
    {
        reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
    }

    __device__ __forceinline__ operator int() const
    {
        return mySum;
    }

    int mySum;
};
template <> struct L1Dist<float>
{
    typedef float value_type;
    typedef float result_type;

    __device__ __forceinline__ L1Dist() : mySum(0.0f) {}

    __device__ __forceinline__ void reduceIter(float val1, float val2)
    {
        mySum += ::fabs(val1 - val2);
    }

    template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
    {
        reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
    }

    __device__ __forceinline__ operator float() const
    {
        return mySum;
    }

    float mySum;
};

struct L2Dist
{
    typedef float value_type;
    typedef float result_type;

    __device__ __forceinline__ L2Dist() : mySum(0.0f) {}

    __device__ __forceinline__ void reduceIter(float val1, float val2)
    {
        float reg = val1 - val2;
        mySum += reg * reg;
    }

    template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
    {
        reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile float>());
    }

    __device__ __forceinline__ operator float() const
    {
        return sqrtf(mySum);
    }

    float mySum;
};

struct HammingDist
{
    typedef int value_type;
    typedef int result_type;

    __device__ __forceinline__ HammingDist() : mySum(0) {}

    __device__ __forceinline__ void reduceIter(int val1, int val2)
    {
        mySum += __popc(val1 ^ val2);
    }

    template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
    {
        reduce<THREAD_DIM>(smem, mySum, tid, plus<volatile int>());
    }

    __device__ __forceinline__ operator int() const
    {
        return mySum;
    }

    int mySum;
};

// calc distance between two vectors in global memory
template <int THREAD_DIM, typename Dist, typename T1, typename T2>
__device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
{
    for (int i = tid; i < len; i += THREAD_DIM)
    {
        T1 val1;
        ForceGlob<T1>::Load(vec1, i, val1);

        T2 val2;
        ForceGlob<T2>::Load(vec2, i, val2);

        dist.reduceIter(val1, val2);
    }

    dist.reduceAll<THREAD_DIM>(smem, tid);
}

// calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
__device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
{
    detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);

    dist.reduceAll<THREAD_DIM>(smem, tid);
}

// calc distance between two vectors in global memory
template <int THREAD_DIM, typename T1> struct VecDiffGlobal
{
    explicit __device__ __forceinline__ VecDiffGlobal(const T1* vec1_, int = 0, void* = 0, int = 0, int = 0)
    {
        vec1 = vec1_;
    }

    template <typename T2, typename Dist>
    __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
    {
        calcVecDiffGlobal<THREAD_DIM>(vec1, vec2, len, dist, smem, tid);
    }

    const T1* vec1;
};

// calc distance between two vectors, first vector is cached in register memory, second vector is in global memory
template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename U> struct VecDiffCachedRegister
{
    template <typename T1> __device__ __forceinline__ VecDiffCachedRegister(const T1* vec1, int len, U* smem, int glob_tid, int tid)
    {
        if (glob_tid < len)
            smem[glob_tid] = vec1[glob_tid];
        __syncthreads();

        U* vec1ValsPtr = vec1Vals;

        #pragma unroll
        for (int i = tid; i < MAX_LEN; i += THREAD_DIM)
            *vec1ValsPtr++ = smem[i];

        __syncthreads();
    }

    template <typename T2, typename Dist>
    __device__ __forceinline__ void calc(const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid) const
    {
        calcVecDiffCached<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>(vec1Vals, vec2, len, dist, smem, tid);
    }

    U vec1Vals[MAX_LEN / THREAD_DIM];
};

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_VEC_DISTANCE_HPP__
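The functors plug into the calc helpers: reduceIter accumulates a per-thread partial and reduceAll folds it across THREAD_DIM threads. A sketch of a one-pair L2 kernel (illustrative, not from this commit):

template <int THREAD_DIM>
__global__ void pairL2(const float* vecA, const float* vecB, int len, float* out)
{
    __shared__ float smem[THREAD_DIM];

    L2Dist dist;                                              // mySum = 0.0f
    calcVecDiffGlobal<THREAD_DIM>(vecA, vecB, len, dist, smem, threadIdx.x);

    if (threadIdx.x == 0)
        *out = dist;                                          // operator float() returns sqrtf(mySum)
}
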
@ -48,85 +48,85 @@

#include "vec_traits.hpp"
#include "functional.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

namespace detail
{
    template <int cn, typename VecD> struct SatCastHelper;
    template <typename VecD> struct SatCastHelper<1, VecD>
    {
        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
        {
            typedef typename VecTraits<VecD>::elem_type D;
            return VecTraits<VecD>::make(saturate_cast<D>(v.x));
        }
    };
    template <typename VecD> struct SatCastHelper<2, VecD>
    {
        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
        {
            typedef typename VecTraits<VecD>::elem_type D;
            return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y));
        }
    };
    template <typename VecD> struct SatCastHelper<3, VecD>
    {
        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
        {
            typedef typename VecTraits<VecD>::elem_type D;
            return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z));
        }
    };
    template <typename VecD> struct SatCastHelper<4, VecD>
    {
        template <typename VecS> static __device__ __forceinline__ VecD cast(const VecS& v)
        {
            typedef typename VecTraits<VecD>::elem_type D;
            return VecTraits<VecD>::make(saturate_cast<D>(v.x), saturate_cast<D>(v.y), saturate_cast<D>(v.z), saturate_cast<D>(v.w));
        }
    };

    template <typename VecD, typename VecS> static __device__ __forceinline__ VecD saturate_cast_caller(const VecS& v)
    {
        return SatCastHelper<VecTraits<VecD>::cn, VecD>::cast(v);
    }
}

template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar1& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char1& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort1& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short1& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint1& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int1& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float1& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double1& v) {return detail::saturate_cast_caller<_Tp>(v);}

template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar2& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char2& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort2& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short2& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint2& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int2& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float2& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double2& v) {return detail::saturate_cast_caller<_Tp>(v);}

template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar3& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char3& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort3& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short3& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint3& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int3& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float3& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double3& v) {return detail::saturate_cast_caller<_Tp>(v);}

template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uchar4& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const char4& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const ushort4& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const short4& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const uint4& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const int4& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const float4& v) {return detail::saturate_cast_caller<_Tp>(v);}
template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(const double4& v) {return detail::saturate_cast_caller<_Tp>(v);}

#define OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, op, func) \
    __device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a) \

@ -150,49 +150,49 @@ namespace cv { namespace gpu { namespace device
        return VecTraits<TypeVec<func<type>::result_type, 4>::vec_type>::make(f(a.x), f(a.y), f(a.z), f(a.w)); \
    }

namespace detail
{
    template <typename T1, typename T2> struct BinOpTraits
    {
        typedef int argument_type;
    };
    template <typename T> struct BinOpTraits<T, T>
    {
        typedef T argument_type;
    };
    template <typename T> struct BinOpTraits<T, double>
    {
        typedef double argument_type;
    };
    template <typename T> struct BinOpTraits<double, T>
    {
        typedef double argument_type;
    };
    template <> struct BinOpTraits<double, double>
    {
        typedef double argument_type;
    };
    template <typename T> struct BinOpTraits<T, float>
    {
        typedef float argument_type;
    };
    template <typename T> struct BinOpTraits<float, T>
    {
        typedef float argument_type;
    };
    template <> struct BinOpTraits<float, float>
    {
        typedef float argument_type;
    };
    template <> struct BinOpTraits<double, float>
    {
        typedef double argument_type;
    };
    template <> struct BinOpTraits<float, double>
    {
        typedef double argument_type;
    };
}

#define OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, op, func) \
    __device__ __forceinline__ TypeVec<func<type>::result_type, 1>::vec_type op(const type ## 1 & a, const type ## 1 & b) \

@ -313,19 +313,20 @@ namespace cv { namespace gpu { namespace device
    OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, operator ^, bit_xor) \
    OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ~, bit_not)

OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uchar)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(char)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(ushort)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(short)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(int)
OPENCV_GPU_IMPLEMENT_VEC_INT_OP(uint)
OPENCV_GPU_IMPLEMENT_VEC_OP(float)
OPENCV_GPU_IMPLEMENT_VEC_OP(double)

#undef OPENCV_GPU_IMPLEMENT_VEC_UNOP
#undef OPENCV_GPU_IMPLEMENT_VEC_BINOP
#undef OPENCV_GPU_IMPLEMENT_VEC_OP
#undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_VECMATH_HPP__
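
Together with vec_traits this gives channel-wise arithmetic plus channel-wise saturating conversion; a sketch (it assumes operator- is among the arithmetic operators the OPENCV_GPU_IMPLEMENT_VEC_OP block generates, alongside the bitwise ones visible in the hunk above):

__device__ float4 channelDiff(const uchar4& p, const uchar4& q)
{
    const float4 a = saturate_cast<float4>(p);   // detail::SatCastHelper<4, float4>::cast, per channel
    const float4 b = saturate_cast<float4>(q);
    return a - b;                                // assumed generated by OPENCV_GPU_IMPLEMENT_VEC_OP(float)
}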
@ -45,82 +45,82 @@
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
template<typename T, int N> struct TypeVec;
|
||||
BEGIN_OPENCV_DEVICE_NAMESPACE
|
||||
|
||||
struct __align__(8) uchar8
|
||||
{
|
||||
uchar a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)
|
||||
{
|
||||
uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(8) char8
|
||||
{
|
||||
schar a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)
|
||||
{
|
||||
char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(16) ushort8
|
||||
{
|
||||
ushort a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)
|
||||
{
|
||||
ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(16) short8
|
||||
{
|
||||
short a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)
|
||||
{
|
||||
short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(32) uint8
|
||||
{
|
||||
uint a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)
|
||||
{
|
||||
uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(32) int8
|
||||
{
|
||||
int a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)
|
||||
{
|
||||
int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(32) float8
|
||||
{
|
||||
float a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)
|
||||
{
|
||||
float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct double8
|
||||
{
|
||||
double a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)
|
||||
{
|
||||
double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
template<typename T, int N> struct TypeVec;
|
||||
|
||||
struct __align__(8) uchar8
|
||||
{
|
||||
uchar a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ uchar8 make_uchar8(uchar a0, uchar a1, uchar a2, uchar a3, uchar a4, uchar a5, uchar a6, uchar a7)
|
||||
{
|
||||
uchar8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(8) char8
|
||||
{
|
||||
schar a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ char8 make_char8(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7)
|
||||
{
|
||||
char8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(16) ushort8
|
||||
{
|
||||
ushort a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ ushort8 make_ushort8(ushort a0, ushort a1, ushort a2, ushort a3, ushort a4, ushort a5, ushort a6, ushort a7)
|
||||
{
|
||||
ushort8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(16) short8
|
||||
{
|
||||
short a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ short8 make_short8(short a0, short a1, short a2, short a3, short a4, short a5, short a6, short a7)
|
||||
{
|
||||
short8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(32) uint8
|
||||
{
|
||||
uint a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ uint8 make_uint8(uint a0, uint a1, uint a2, uint a3, uint a4, uint a5, uint a6, uint a7)
|
||||
{
|
||||
uint8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(32) int8
|
||||
{
|
||||
int a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ int8 make_int8(int a0, int a1, int a2, int a3, int a4, int a5, int a6, int a7)
|
||||
{
|
||||
int8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct __align__(32) float8
|
||||
{
|
||||
float a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ float8 make_float8(float a0, float a1, float a2, float a3, float a4, float a5, float a6, float a7)
|
||||
{
|
||||
float8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
struct double8
|
||||
{
|
||||
double a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
};
|
||||
static __host__ __device__ __forceinline__ double8 make_double8(double a0, double a1, double a2, double a3, double a4, double a5, double a6, double a7)
|
||||
{
|
||||
double8 val = {a0, a1, a2, a3, a4, a5, a6, a7};
|
||||
return val;
|
||||
}
|
||||
|
||||
#define OPENCV_GPU_IMPLEMENT_TYPE_VEC(type) \
|
||||
template<> struct TypeVec<type, 1> { typedef type vec_type; }; \
|
||||
@ -134,28 +134,28 @@ namespace cv { namespace gpu { namespace device
|
||||
template<> struct TypeVec<type, 8> { typedef type ## 8 vec_type; }; \
|
||||
template<> struct TypeVec<type ## 8, 8> { typedef type ## 8 vec_type; };
|
||||
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(ushort)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(short)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(int)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(uint)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(float)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(double)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(uchar)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(char)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(ushort)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(short)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(int)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(uint)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(float)
|
||||
OPENCV_GPU_IMPLEMENT_TYPE_VEC(double)
|
||||
|
||||
#undef OPENCV_GPU_IMPLEMENT_TYPE_VEC
|
||||
|
||||
template<> struct TypeVec<schar, 1> { typedef schar vec_type; };
|
||||
template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };
|
||||
template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
|
||||
template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
|
||||
template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };
|
||||
template<> struct TypeVec<schar, 1> { typedef schar vec_type; };
|
||||
template<> struct TypeVec<schar, 2> { typedef char2 vec_type; };
|
||||
template<> struct TypeVec<schar, 3> { typedef char3 vec_type; };
|
||||
template<> struct TypeVec<schar, 4> { typedef char4 vec_type; };
|
||||
template<> struct TypeVec<schar, 8> { typedef char8 vec_type; };
|
||||
|
||||
template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };
|
||||
template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };
|
||||
template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
|
||||
template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
|
||||
template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };
|
||||
template<> struct TypeVec<bool, 1> { typedef uchar vec_type; };
|
||||
template<> struct TypeVec<bool, 2> { typedef uchar2 vec_type; };
|
||||
template<> struct TypeVec<bool, 3> { typedef uchar3 vec_type; };
|
||||
template<> struct TypeVec<bool, 4> { typedef uchar4 vec_type; };
|
||||
template<> struct TypeVec<bool, 8> { typedef uchar8 vec_type; };
|
||||
|
||||
template<typename T> struct VecTraits;
|
||||
|
||||
@ -209,72 +209,73 @@ namespace cv { namespace gpu { namespace device

    static __device__ __host__ __forceinline__ type ## 8 make(const type* v) {return make_ ## type ## 8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);} \
};

OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(ushort)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(short)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(int)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uint)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(float)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(double)

#undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS

template<> struct VecTraits<char>
{
    typedef char elem_type;
    enum {cn=1};
    static __device__ __host__ __forceinline__ char all(char v) {return v;}
    static __device__ __host__ __forceinline__ char make(char x) {return x;}
    static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}
};
template<> struct VecTraits<schar>
{
    typedef schar elem_type;
    enum {cn=1};
    static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
    static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
    static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}
};
template<> struct VecTraits<char1>
{
    typedef schar elem_type;
    enum {cn=1};
    static __device__ __host__ __forceinline__ char1 all(schar v) {return make_char1(v);}
    static __device__ __host__ __forceinline__ char1 make(schar x) {return make_char1(x);}
    static __device__ __host__ __forceinline__ char1 make(const schar* v) {return make_char1(v[0]);}
};
template<> struct VecTraits<char2>
{
    typedef schar elem_type;
    enum {cn=2};
    static __device__ __host__ __forceinline__ char2 all(schar v) {return make_char2(v, v);}
    static __device__ __host__ __forceinline__ char2 make(schar x, schar y) {return make_char2(x, y);}
    static __device__ __host__ __forceinline__ char2 make(const schar* v) {return make_char2(v[0], v[1]);}
};
template<> struct VecTraits<char3>
{
    typedef schar elem_type;
    enum {cn=3};
    static __device__ __host__ __forceinline__ char3 all(schar v) {return make_char3(v, v, v);}
    static __device__ __host__ __forceinline__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);}
    static __device__ __host__ __forceinline__ char3 make(const schar* v) {return make_char3(v[0], v[1], v[2]);}
};
template<> struct VecTraits<char4>
{
    typedef schar elem_type;
    enum {cn=4};
    static __device__ __host__ __forceinline__ char4 all(schar v) {return make_char4(v, v, v, v);}
    static __device__ __host__ __forceinline__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);}
    static __device__ __host__ __forceinline__ char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);}
};
template<> struct VecTraits<char8>
{
    typedef schar elem_type;
    enum {cn=8};
    static __device__ __host__ __forceinline__ char8 all(schar v) {return make_char8(v, v, v, v, v, v, v, v);}
    static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}
    static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);}
};
}}}

END_OPENCV_DEVICE_NAMESPACE

#endif // __OPENCV_GPU_VEC_TRAITS_HPP__
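A minimal usage sketch, not part of this commit: the kernel name and launch shape are illustrative, and it assumes the device namespace opened above (cv::gpu::device after this refactoring) is visible.

// Fill an n-element image of cn-channel pixels with one scalar value.
// TypeVec<T, cn>::vec_type picks the matching CUDA vector type (e.g. uchar4
// for <uchar, 4>), and VecTraits<...>::all broadcasts val to every channel.
template <typename T, int cn>
__global__ void fill_kernel(typename TypeVec<T, cn>::vec_type* dst, int n, T val)
{
    typedef typename TypeVec<T, cn>::vec_type vec_type;

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = VecTraits<vec_type>::all(val);  // e.g. make_uchar4(val, val, val, val)
}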
@ -40,79 +40,76 @@
//
//M*/

#ifndef __OPENCV_GPU_DEVICE_WARP_HPP_
#define __OPENCV_GPU_DEVICE_WARP_HPP_
#ifndef __OPENCV_GPU_DEVICE_WARP_HPP__
#define __OPENCV_GPU_DEVICE_WARP_HPP__

namespace cv
{
    namespace gpu
    {
        namespace device
        {
#include "internal_shared.hpp"

BEGIN_OPENCV_DEVICE_NAMESPACE

struct Warp
{
    enum
    {
        LOG_WARP_SIZE = 5,
        WARP_SIZE = 1 << LOG_WARP_SIZE,
        STRIDE = WARP_SIZE
    };

    /** \brief Returns the warp lane ID of the calling thread. */
    static __device__ __forceinline__ unsigned int laneId()
    {
        unsigned int ret;
        asm("mov.u32 %0, %laneid;" : "=r"(ret) );
        return ret;
    }

    template<typename It, typename T>
    static __device__ __forceinline__ void fill(It beg, It end, const T& value)
    {
        for(It t = beg + laneId(); t < end; t += STRIDE)
            *t = value;
    }

    template<typename InIt, typename OutIt>
    static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
    {
        for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
            *out = *t;
        return out;
    }

    template<typename InIt, typename OutIt, class UnOp>
    static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)
    {
        for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
            *out = op(*t);
        return out;
    }

    template<typename InIt1, typename InIt2, typename OutIt, class BinOp>
    static __device__ __forceinline__ OutIt transform(InIt1 beg1, InIt1 end1, InIt2 beg2, OutIt out, BinOp op)
    {
        unsigned int lane = laneId();

        InIt1 t1 = beg1 + lane;
        InIt2 t2 = beg2 + lane;
        for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)
            *out = op(*t1, *t2);
        return out;
    }

    template<typename OutIt, typename T>
    static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
    {
        unsigned int lane = laneId();
        value += lane;

        for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)
            *t = value;
    }
};
        }
    }
}

#endif /* __OPENCV_GPU_DEVICE_WARP_HPP_ */

END_OPENCV_DEVICE_NAMESPACE

#endif /* __OPENCV_GPU_DEVICE_WARP_HPP__ */
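A hedged sketch, not from this commit, of how the Warp helpers are meant to be called; the kernel name and buffer size are illustrative. Each primitive strides by WARP_SIZE from laneId(), so the 32 lanes of one warp cooperatively cover the whole range.

__global__ void warp_demo(int* out)
{
    __shared__ int buf[256];

    if (threadIdx.x < Warp::WARP_SIZE)   // only the first warp participates
    {
        Warp::fill(buf, buf + 256, 0);   // each lane writes every 32nd slot
        Warp::yota(buf, buf + 256, 0);   // then buf[i] = i ("yota" is this header's spelling of iota)
    }
    __syncthreads();

    out[threadIdx.x] = buf[threadIdx.x];
}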
@ -41,33 +41,32 @@
//M*/


#ifndef OPENCV_GPU_WARP_REDUCE_HPP_
#define OPENCV_GPU_WARP_REDUCE_HPP_
#ifndef OPENCV_GPU_WARP_REDUCE_HPP__
#define OPENCV_GPU_WARP_REDUCE_HPP__

#include "internal_shared.hpp"

namespace cv
{
    namespace device
    {
BEGIN_OPENCV_DEVICE_NAMESPACE

template <class T>
__device__ __forceinline__ T warp_reduce ( volatile T *ptr , const unsigned int tid = threadIdx.x )
{
    const unsigned int lane = tid & 31; // index of thread in warp (0..31)

    if (lane < 16)
    {
        T partial = ptr[tid];

        ptr[tid] = partial = partial + ptr[tid + 16];
        ptr[tid] = partial = partial + ptr[tid +  8];
        ptr[tid] = partial = partial + ptr[tid +  4];
        ptr[tid] = partial = partial + ptr[tid +  2];
        ptr[tid] = partial = partial + ptr[tid +  1];
    }

    return ptr[tid - lane];
}
    }
}

#endif /* OPENCV_GPU_WARP_REDUCE_HPP_ */

END_OPENCV_DEVICE_NAMESPACE

#endif /* OPENCV_GPU_WARP_REDUCE_HPP__ */
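A usage sketch under stated assumptions (warp-synchronous execution, as this era's GPUs provided, and a volatile shared buffer of at least 32 elements per warp); the kernel name is illustrative. Every lane gets the warp's sum back because the result is read from ptr[tid - lane].

__global__ void warp_sum(const int* in, int* out)
{
    __shared__ volatile int smem[32];

    smem[threadIdx.x] = in[threadIdx.x];      // launched with one 32-thread warp

    int sum = warp_reduce(smem, threadIdx.x); // in-place tree reduction over the warp

    if (threadIdx.x == 0)
        *out = sum;
}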
@ -42,6 +42,8 @@

#include "precomp.hpp"

using namespace cv;
using namespace cv::gpu;
using namespace std;

#if !defined (HAVE_CUDA)

@ -53,25 +55,25 @@ void cv::gpu::split(const GpuMat& /*src*/, vector<GpuMat>& /*dst*/, Stream& /*st

#else /* !defined (HAVE_CUDA) */

namespace cv { namespace gpu { namespace split_merge
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace split_merge
{
    extern "C" void merge_caller(const DevMem2Db* src, DevMem2Db& dst,
                                 int total_channels, size_t elem_size,
                                 const cudaStream_t& stream);
    extern "C" void split_caller(const DevMem2Db& src, DevMem2Db* dst,
                                 int num_channels, size_t elem_size1,
                                 const cudaStream_t& stream);
    void merge_caller(const DevMem2Db* src, DevMem2Db& dst, int total_channels, size_t elem_size, const cudaStream_t& stream);
    void split_caller(const DevMem2Db& src, DevMem2Db* dst, int num_channels, size_t elem_size1, const cudaStream_t& stream);
}

END_OPENCV_DEVICE_NAMESPACE

namespace
{
    void merge(const GpuMat* src, size_t n, GpuMat& dst, const cudaStream_t& stream)
    {
        using namespace OPENCV_DEVICE_NAMESPACE_ split_merge;

        CV_Assert(src);
        CV_Assert(n > 0);

        bool double_ok = TargetArchs::builtWith(NATIVE_DOUBLE) &&
                         DeviceInfo().supports(NATIVE_DOUBLE);
        CV_Assert(src[0].depth() != CV_64F || double_ok);

        int depth = src[0].depth();
        Size size = src[0].size();

@ -100,20 +102,15 @@ namespace cv { namespace gpu { namespace split_merge
            src_as_devmem[i] = src[i];

        DevMem2Db dst_as_devmem(dst);
        split_merge::merge_caller(src_as_devmem, dst_as_devmem,
                                  total_channels, CV_ELEM_SIZE(depth),
                                  stream);
        merge_caller(src_as_devmem, dst_as_devmem, total_channels, CV_ELEM_SIZE(depth), stream);
    }

    void split(const GpuMat& src, GpuMat* dst, const cudaStream_t& stream)
    {
        CV_Assert(dst);
        using namespace OPENCV_DEVICE_NAMESPACE_ split_merge;

        bool double_ok = TargetArchs::builtWith(NATIVE_DOUBLE) &&
                         DeviceInfo().supports(NATIVE_DOUBLE);
        CV_Assert(src.depth() != CV_64F || double_ok);
        CV_Assert(dst);

        int depth = src.depth();
        int num_channels = src.channels();

@ -135,38 +132,31 @@ namespace cv { namespace gpu { namespace split_merge
            dst_as_devmem[i] = dst[i];

        DevMem2Db src_as_devmem(src);
        split_merge::split_caller(src_as_devmem, dst_as_devmem,
                                  num_channels, src.elemSize1(),
                                  stream);
        split_caller(src_as_devmem, dst_as_devmem, num_channels, src.elemSize1(), stream);
    }
}}}
}

void cv::gpu::merge(const GpuMat* src, size_t n, GpuMat& dst, Stream& stream)
{
    split_merge::merge(src, n, dst, StreamAccessor::getStream(stream));
    ::merge(src, n, dst, StreamAccessor::getStream(stream));
}

void cv::gpu::merge(const vector<GpuMat>& src, GpuMat& dst, Stream& stream)
{
    split_merge::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream));
    ::merge(&src[0], src.size(), dst, StreamAccessor::getStream(stream));
}

void cv::gpu::split(const GpuMat& src, GpuMat* dst, Stream& stream)
{
    split_merge::split(src, dst, StreamAccessor::getStream(stream));
    ::split(src, dst, StreamAccessor::getStream(stream));
}

void cv::gpu::split(const GpuMat& src, vector<GpuMat>& dst, Stream& stream)
{
    dst.resize(src.channels());
    if(src.channels() > 0)
        split_merge::split(src, &dst[0], StreamAccessor::getStream(stream));
        ::split(src, &dst[0], StreamAccessor::getStream(stream));
}

#endif /* !defined (HAVE_CUDA) */
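A host-side usage sketch of the public API implemented here; the file name is illustrative, the headers are those of this era of the library.

#include <opencv2/opencv.hpp>
#include <opencv2/gpu/gpu.hpp>

int main()
{
    cv::Mat bgr = cv::imread("input.png");      // 8UC3
    cv::gpu::GpuMat d_bgr(bgr);                 // upload

    std::vector<cv::gpu::GpuMat> d_channels;
    cv::gpu::split(d_bgr, d_channels);          // three 8UC1 device images

    cv::gpu::GpuMat d_merged;
    cv::gpu::merge(d_channels, d_merged);       // back to one 8UC3 image

    cv::Mat roundtrip;
    d_merged.download(roundtrip);               // explicit device-to-host copy
    return 0;
}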
@ -55,21 +55,23 @@ void cv::gpu::StereoBM_GPU::operator() ( const GpuMat&, const GpuMat&, GpuMat&,

#else /* !defined (HAVE_CUDA) */

namespace cv { namespace gpu
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace stereobm
{
    namespace bm
    {
        //extern "C" void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf);
        extern "C" void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf, cudaStream_t & stream);
        extern "C" void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap /*= 31*/, cudaStream_t & stream);
        extern "C" void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream);
    }
}}
    void stereoBM_GPU(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& disp, int ndisp, int winsz, const DevMem2D_<uint>& minSSD_buf, cudaStream_t & stream);
    void prefilter_xsobel(const DevMem2Db& input, const DevMem2Db& output, int prefilterCap /*= 31*/, cudaStream_t & stream);
    void postfilter_textureness(const DevMem2Db& input, int winsz, float avgTexturenessThreshold, const DevMem2Db& disp, cudaStream_t & stream);
}

END_OPENCV_DEVICE_NAMESPACE

const float defaultAvgTexThreshold = 3;

cv::gpu::StereoBM_GPU::StereoBM_GPU()
    : preset(BASIC_PRESET), ndisp(DEFAULT_NDISP), winSize(DEFAULT_WINSZ), avergeTexThreshold(defaultAvgTexThreshold) {}
    : preset(BASIC_PRESET), ndisp(DEFAULT_NDISP), winSize(DEFAULT_WINSZ), avergeTexThreshold(defaultAvgTexThreshold)
{
}

cv::gpu::StereoBM_GPU::StereoBM_GPU(int preset_, int ndisparities_, int winSize_)
    : preset(preset_), ndisp(ndisparities_), winSize(winSize_), avergeTexThreshold(defaultAvgTexThreshold)

@ -93,39 +95,44 @@ bool cv::gpu::StereoBM_GPU::checkIfGpuCallReasonable()
    return false;
}

static void stereo_bm_gpu_operator ( GpuMat& minSSD, GpuMat& leBuf, GpuMat& riBuf, int preset, int ndisp, int winSize, float avergeTexThreshold, const GpuMat& left, const GpuMat& right, GpuMat& disparity, cudaStream_t stream)
namespace
{
    void stereo_bm_gpu_operator( GpuMat& minSSD, GpuMat& leBuf, GpuMat& riBuf, int preset, int ndisp, int winSize, float avergeTexThreshold, const GpuMat& left, const GpuMat& right, GpuMat& disparity, cudaStream_t stream)
    {
        using namespace OPENCV_DEVICE_NAMESPACE_ stereobm;

        CV_DbgAssert(left.rows == right.rows && left.cols == right.cols);
        CV_DbgAssert(left.type() == CV_8UC1);
        CV_DbgAssert(right.type() == CV_8UC1);

        disparity.create(left.size(), CV_8U);
        minSSD.create(left.size(), CV_32S);

        GpuMat le_for_bm = left;
        GpuMat ri_for_bm = right;

        if (preset == StereoBM_GPU::PREFILTER_XSOBEL)
        {
            leBuf.create( left.size(),  left.type());
            riBuf.create(right.size(), right.type());

            bm::prefilter_xsobel( left, leBuf, 31, stream);
            bm::prefilter_xsobel(right, riBuf, 31, stream);
            prefilter_xsobel( left, leBuf, 31, stream);
            prefilter_xsobel(right, riBuf, 31, stream);

            le_for_bm = leBuf;
            ri_for_bm = riBuf;
        }

        bm::stereoBM_GPU(le_for_bm, ri_for_bm, disparity, ndisp, winSize, minSSD, stream);
        stereoBM_GPU(le_for_bm, ri_for_bm, disparity, ndisp, winSize, minSSD, stream);

        if (avergeTexThreshold)
            bm::postfilter_textureness(le_for_bm, winSize, avergeTexThreshold, disparity, stream);
            postfilter_textureness(le_for_bm, winSize, avergeTexThreshold, disparity, stream);
    }
}

void cv::gpu::StereoBM_GPU::operator() ( const GpuMat& left, const GpuMat& right, GpuMat& disparity, Stream& stream)
{
    ::stereo_bm_gpu_operator(minSSD, leBuf, riBuf, preset, ndisp, winSize, avergeTexThreshold, left, right, disparity, StreamAccessor::getStream(stream));
    stereo_bm_gpu_operator(minSSD, leBuf, riBuf, preset, ndisp, winSize, avergeTexThreshold, left, right, disparity, StreamAccessor::getStream(stream));
}

#endif /* !defined (HAVE_CUDA) */
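A hedged usage sketch for the refactored block-matching stereo class; the parameter values and file names are illustrative only.

cv::Mat left  = cv::imread("left.png",  0);   // CV_8UC1 rectified pair
cv::Mat right = cv::imread("right.png", 0);

cv::gpu::StereoBM_GPU bm(cv::gpu::StereoBM_GPU::PREFILTER_XSOBEL, 64 /*ndisp*/, 19 /*winSize*/);

cv::gpu::GpuMat d_left(left), d_right(right), d_disp;
bm(d_left, d_right, d_disp);                  // a Stream overload exists for async use

cv::Mat disp;
d_disp.download(disp);                        // CV_8U disparity map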
@ -59,7 +59,9 @@ void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat&, GpuMat&, Stream

#else /* !defined (HAVE_CUDA) */

namespace cv { namespace gpu { namespace bp
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace stereobp
{
    void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump);
    template<typename T, typename D>
@ -74,7 +76,11 @@ namespace cv { namespace gpu { namespace bp
    template <typename T>
    void output_gpu(const DevMem2Db& u, const DevMem2Db& d, const DevMem2Db& l, const DevMem2Db& r, const DevMem2Db& data,
                    const DevMem2D_<short>& disp, cudaStream_t stream);
}}}
}

END_OPENCV_DEVICE_NAMESPACE

using namespace OPENCV_DEVICE_NAMESPACE_ stereobp;

namespace
{
@ -84,7 +90,6 @@ namespace
    const float DEFAULT_DISC_SINGLE_JUMP = 1.0f;
}

void cv::gpu::StereoBeliefPropagation::estimateRecommendedParams(int width, int height, int& ndisp, int& iters, int& levels)
{
    ndisp = width / 4;
@ -136,8 +141,8 @@ namespace
    typedef void (*comp_data_t)(const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& data, cudaStream_t stream);
    static const comp_data_t comp_data_callers[2][5] =
    {
        {0, bp::comp_data_gpu<unsigned char, short>, 0, bp::comp_data_gpu<uchar3, short>, bp::comp_data_gpu<uchar4, short>},
        {0, bp::comp_data_gpu<unsigned char, float>, 0, bp::comp_data_gpu<uchar3, float>, bp::comp_data_gpu<uchar4, float>}
        {0, comp_data_gpu<unsigned char, short>, 0, comp_data_gpu<uchar3, short>, comp_data_gpu<uchar4, short>},
        {0, comp_data_gpu<unsigned char, float>, 0, comp_data_gpu<uchar3, float>, comp_data_gpu<uchar4, float>}
    };

    CV_Assert(left.size() == right.size() && left.type() == right.type());
@ -236,7 +241,7 @@ namespace
        }
    }

    bp::load_constants(rthis.ndisp, rthis.max_data_term, scale * rthis.data_weight, scale * rthis.max_disc_term, scale * rthis.disc_single_jump);
    load_constants(rthis.ndisp, rthis.max_data_term, scale * rthis.data_weight, scale * rthis.max_disc_term, scale * rthis.disc_single_jump);

    datas.resize(rthis.levels);

@ -249,8 +254,6 @@ namespace

    void calcBP(GpuMat& disp, Stream& stream)
    {
        using namespace cv::gpu::bp;

        typedef void (*data_step_down_t)(int dst_cols, int dst_rows, int src_rows, const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
        static const data_step_down_t data_step_down_callers[2] =
        {
@ -354,13 +357,13 @@ namespace

void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& left, const GpuMat& right, GpuMat& disp, Stream& stream)
{
    ::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
    StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
    impl(left, right, disp, stream);
}

void cv::gpu::StereoBeliefPropagation::operator()(const GpuMat& data, GpuMat& disp, Stream& stream)
{
    ::StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
    StereoBeliefPropagationImpl impl(*this, u, d, l, r, u2, d2, l2, r2, datas, out);
    impl(data, disp, stream);
}
@ -57,7 +57,9 @@ void cv::gpu::StereoConstantSpaceBP::operator()(const GpuMat&, const GpuMat&, Gp

#else /* !defined (HAVE_CUDA) */

namespace cv { namespace gpu { namespace csbp
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace stereocsbp
{
    void load_constants(int ndisp, float max_data_term, float data_weight, float max_disc_term, float disc_single_jump, int min_disp_th,
                        const DevMem2Db& left, const DevMem2Db& right, const DevMem2Db& temp);
@ -84,8 +86,11 @@ namespace cv { namespace gpu { namespace csbp
    template<class T>
    void compute_disp(const T* u, const T* d, const T* l, const T* r, const T* data_cost_selected, const T* disp_selected, size_t msg_step,
                      const DevMem2D_<short>& disp, int nr_plane, cudaStream_t stream);
}
}}}

END_OPENCV_DEVICE_NAMESPACE

using namespace OPENCV_DEVICE_NAMESPACE_ stereocsbp;

namespace
{
@ -208,8 +213,7 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
    ////////////////////////////////////////////////////////////////////////////
    // Compute

    csbp::load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight,
                         rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);
    load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight, rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);

    if (stream)
    {
@ -248,28 +252,28 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
    {
        if (i == levels - 1)
        {
            csbp::init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),
            init_data_cost(left.rows, left.cols, disp_selected_pyr[cur_idx].ptr<T>(), data_cost_selected.ptr<T>(),
                step_pyr[i], rows_pyr[i], cols_pyr[i], i, nr_plane_pyr[i], rthis.ndisp, left.channels(), rthis.use_local_init_data_cost, cudaStream);
        }
        else
        {
            csbp::compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1],
            compute_data_cost(disp_selected_pyr[cur_idx].ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1],
                left.rows, left.cols, rows_pyr[i], cols_pyr[i], rows_pyr[i+1], i, nr_plane_pyr[i+1], left.channels(), cudaStream);

            int new_idx = (cur_idx + 1) & 1;

            csbp::init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(),
                u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(),
                data_cost_selected.ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1], rows_pyr[i],
                cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream);
            init_message(u[new_idx].ptr<T>(), d[new_idx].ptr<T>(), l[new_idx].ptr<T>(), r[new_idx].ptr<T>(),
                u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
                disp_selected_pyr[new_idx].ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(),
                data_cost_selected.ptr<T>(), data_cost.ptr<T>(), step_pyr[i], step_pyr[i+1], rows_pyr[i],
                cols_pyr[i], nr_plane_pyr[i], rows_pyr[i+1], cols_pyr[i+1], nr_plane_pyr[i+1], cudaStream);

            cur_idx = new_idx;
        }

        csbp::calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
            data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[i],
            rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream);
        calc_all_iterations(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
            data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[i],
            rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rthis.iters, cudaStream);
    }

    if (disp.empty())
@ -282,8 +286,8 @@ static void csbp_operator(StereoConstantSpaceBP& rthis, GpuMat u[2], GpuMat d[2]
    else
        out.setTo(zero);

    csbp::compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
        data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[0], out, nr_plane_pyr[0], cudaStream);
    compute_disp(u[cur_idx].ptr<T>(), d[cur_idx].ptr<T>(), l[cur_idx].ptr<T>(), r[cur_idx].ptr<T>(),
        data_cost_selected.ptr<T>(), disp_selected_pyr[cur_idx].ptr<T>(), step_pyr[0], out, nr_plane_pyr[0], cudaStream);

    if (disp.type() != CV_16S)
    {
@ -63,8 +63,17 @@ void cv::gpu::SURF_GPU::releaseMemory() { throw_nogpu(); }

#else /* !defined (HAVE_CUDA) */

namespace cv { namespace gpu { namespace surf
BEGIN_OPENCV_DEVICE_NAMESPACE

namespace surf
{
    void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
    void loadOctaveConstants(int octave, int layer_rows, int layer_cols);

    void bindImgTex(DevMem2Db img);
    void bindSumTex(DevMem2D_<uint> sum);
    void bindMaskSumTex(DevMem2D_<uint> maskSum);

    void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols, int octave, int nOctaveLayers);

    void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
@ -78,9 +87,11 @@ namespace cv { namespace gpu { namespace surf

    void compute_descriptors_gpu(const DevMem2Df& descriptors,
        const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
}}}
}

using namespace cv::gpu::surf;
END_OPENCV_DEVICE_NAMESPACE

using namespace OPENCV_DEVICE_NAMESPACE_ surf;

namespace
{
@ -136,24 +147,18 @@ namespace
    counters.create(1, nOctaves + 1, CV_32SC1);
    counters.setTo(Scalar::all(0));

    uploadConstant("cv::gpu::surf::c_max_candidates", maxCandidates);
    uploadConstant("cv::gpu::surf::c_max_features", maxFeatures);
    uploadConstant("cv::gpu::surf::c_img_rows", img_rows);
    uploadConstant("cv::gpu::surf::c_img_cols", img_cols);
    uploadConstant("cv::gpu::surf::c_nOctaveLayers", nOctaveLayers);
    uploadConstant("cv::gpu::surf::c_hessianThreshold", static_cast<float>(hessianThreshold));
    loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, nOctaveLayers, static_cast<float>(hessianThreshold));

    imgTex.bind("cv::gpu::surf::imgTex", (DevMem2Db)img);
    bindImgTex(img);

    integralBuffered(img, sum, intBuffer);
    sumTex.bind("cv::gpu::surf::sumTex", (DevMem2D_<unsigned int>)sum);
    bindSumTex(sum);

    if (use_mask)
    {
        min(mask, 1.0, mask1);
        integralBuffered(mask1, maskSum, intBuffer);

        maskSumTex.bind("cv::gpu::surf::maskSumTex", (DevMem2D_<unsigned int>)maskSum);
        bindMaskSumTex(maskSum);
    }
}

@ -171,9 +176,7 @@ namespace
    const int layer_rows = img_rows >> octave;
    const int layer_cols = img_cols >> octave;

    uploadConstant("cv::gpu::surf::c_octave", octave);
    uploadConstant("cv::gpu::surf::c_layer_rows", layer_rows);
    uploadConstant("cv::gpu::surf::c_layer_cols", layer_cols);
    loadOctaveConstants(octave, layer_rows, layer_cols);

    icvCalcLayerDetAndTrace_gpu(det, trace, img_rows, img_cols, octave, nOctaveLayers);

@ -242,8 +245,6 @@ namespace
    int maxFeatures;

    GpuMat counters;

    TextureBinder imgTex, sumTex, maskSumTex;
};
}

@ -336,7 +337,7 @@ void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<Key
{
    CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == SF_FEATURE_STRIDE);

    Mat keypointsCPU = keypointsGPU;
    Mat keypointsCPU(keypointsGPU);

    keypoints.resize(nFeatures);
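The test changes below follow the same pattern as this hunk: implicit GpuMat-to-Mat conversions are replaced by explicit constructions, which keeps every device-to-host copy visible at the call site. A minimal sketch (d_result is an illustrative device buffer, not from the commit):

cv::gpu::GpuMat d_result;     // filled on the GPU elsewhere

cv::Mat a(d_result);          // explicit copy via the Mat(GpuMat) constructor
cv::Mat b;
d_result.download(b);         // equivalent explicit download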
@ -549,8 +549,8 @@ TEST_P(MorphEx, Accuracy)
    cv::gpu::GpuMat dev_dst_rgba;
    cv::gpu::GpuMat dev_dst_gray;

    cv::gpu::morphologyEx(cv::gpu::GpuMat(img_rgba), dev_dst_rgba, morphOps[morphOpsIdx], cv::gpu::GpuMat(kernel));
    cv::gpu::morphologyEx(cv::gpu::GpuMat(img_gray), dev_dst_gray, morphOps[morphOpsIdx], cv::gpu::GpuMat(kernel));
    cv::gpu::morphologyEx(cv::gpu::GpuMat(img_rgba), dev_dst_rgba, morphOps[morphOpsIdx], kernel);
    cv::gpu::morphologyEx(cv::gpu::GpuMat(img_gray), dev_dst_gray, morphOps[morphOpsIdx], kernel);

    dev_dst_rgba.download(dst_rgba);
    dev_dst_gray.download(dst_gray);
@ -137,7 +137,7 @@ struct CV_GpuHogDetectTestRunner : cv::gpu::HOGDescriptor
#ifdef DUMP
    dump(block_hists, locations);
#else
    compare(block_hists, locations);
    compare(cv::Mat(block_hists), locations);
#endif

    // Test detect on smaller image
@ -148,7 +148,7 @@ struct CV_GpuHogDetectTestRunner : cv::gpu::HOGDescriptor
#ifdef DUMP
    dump(block_hists, locations);
#else
    compare(block_hists, locations);
    compare(cv::Mat(block_hists), locations);
#endif

    // Test detect on greater image
@ -158,7 +158,7 @@ struct CV_GpuHogDetectTestRunner : cv::gpu::HOGDescriptor
#ifdef DUMP
    dump(block_hists, locations);
#else
    compare(block_hists, locations);
    compare(cv::Mat(block_hists), locations);
#endif
}
@ -254,31 +254,31 @@ struct CV_GpuHogGetDescriptorsTestRunner : cv::gpu::HOGDescriptor
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
    computeBlockHistograms(cv::gpu::GpuMat(img));
    compare_inner_parts(block_hists, descriptors.rowRange(1, 2));
    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(1, 2)));

    img_rgb = readImage("hog/negative1.png");
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
    computeBlockHistograms(cv::gpu::GpuMat(img));
    compare_inner_parts(block_hists, descriptors.rowRange(2, 3));
    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(2, 3)));

    img_rgb = readImage("hog/negative2.png");
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
    computeBlockHistograms(cv::gpu::GpuMat(img));
    compare_inner_parts(block_hists, descriptors.rowRange(3, 4));
    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(3, 4)));

    img_rgb = readImage("hog/positive3.png");
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
    computeBlockHistograms(cv::gpu::GpuMat(img));
    compare_inner_parts(block_hists, descriptors.rowRange(4, 5));
    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(4, 5)));

    img_rgb = readImage("hog/negative3.png");
    ASSERT_TRUE(!img_rgb.empty());
    cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
    computeBlockHistograms(cv::gpu::GpuMat(img));
    compare_inner_parts(block_hists, descriptors.rowRange(5, 6));
    compare_inner_parts(cv::Mat(block_hists), cv::Mat(descriptors.rowRange(5, 6)));
}

// Does not compare border value, as interpolation leads to delta
@ -3897,7 +3897,7 @@ static void testC2C(const std::string& hint, int cols, int rows, int flags, bool
    EXPECT_TRUE(!inplace || d_b.ptr() == d_b_data.ptr());
    ASSERT_EQ(CV_32F, d_b.depth());
    ASSERT_EQ(2, d_b.channels());
    EXPECT_MAT_NEAR(b_gold, d_b, rows * cols * 1e-4);
    EXPECT_MAT_NEAR(b_gold, cv::Mat(d_b), rows * cols * 1e-4);
}

TEST_P(Dft, C2C)
@ -206,7 +206,7 @@ void App::run()
    workEnd();

    // Show results
    disp = d_disp;
    d_disp.download(disp);
    putText(disp, text(), Point(5, 25), FONT_HERSHEY_SIMPLEX, 1.0, Scalar::all(255));
    imshow("disparity", disp);
@ -71,7 +71,7 @@ int main(int argc, char* argv[])

    // drawing the results
    Mat img_matches;
    drawMatches(img1, keypoints1, img2, keypoints2, matches, img_matches);
    drawMatches(Mat(img1), keypoints1, Mat(img2), keypoints2, matches, img_matches);

    namedWindow("matches", 0);
    imshow("matches", img_matches);