created wrappers for new NPP functions
removed void integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& stream) - it fails with NPP_NOT_IMPLEMENTED error updated docs, accuracy and performance tests
This commit is contained in:
@@ -48,6 +48,17 @@
|
|||||||
#ifdef HAVE_CUDA
|
#ifdef HAVE_CUDA
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
#include <npp.h>
|
#include <npp.h>
|
||||||
|
|
||||||
|
#define CUDART_MINIMUM_REQUIRED_VERSION 4010
|
||||||
|
#define NPP_MINIMUM_REQUIRED_VERSION 4100
|
||||||
|
|
||||||
|
#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
|
||||||
|
#error "Insufficient Cuda Runtime library version, please update it."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if (NPP_VERSION_MAJOR * 1000 + NPP_VERSION_MINOR * 100 + NPP_VERSION_BUILD < NPP_MINIMUM_REQUIRED_VERSION)
|
||||||
|
#error "Insufficient NPP version, please update it."
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
@@ -460,15 +471,17 @@ namespace cv { namespace gpu
|
|||||||
|
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
//////////////////////////////////////////////////////////////////////////
|
|
||||||
// Convert
|
|
||||||
|
|
||||||
template<int n> struct NPPTypeTraits;
|
template<int n> struct NPPTypeTraits;
|
||||||
template<> struct NPPTypeTraits<CV_8U> { typedef Npp8u npp_type; };
|
template<> struct NPPTypeTraits<CV_8U> { typedef Npp8u npp_type; };
|
||||||
|
template<> struct NPPTypeTraits<CV_8S> { typedef Npp8s npp_type; };
|
||||||
template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
|
template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
|
||||||
template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
|
template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
|
||||||
template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
|
template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
|
||||||
template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
|
template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
|
||||||
|
template<> struct NPPTypeTraits<CV_64F> { typedef Npp64f npp_type; };
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
// Convert
|
||||||
|
|
||||||
template<int SDEPTH, int DDEPTH> struct NppConvertFunc
|
template<int SDEPTH, int DDEPTH> struct NppConvertFunc
|
||||||
{
|
{
|
||||||
@@ -494,6 +507,7 @@ namespace
|
|||||||
NppiSize sz;
|
NppiSize sz;
|
||||||
sz.width = src.cols;
|
sz.width = src.cols;
|
||||||
sz.height = src.rows;
|
sz.height = src.rows;
|
||||||
|
|
||||||
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );
|
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );
|
||||||
|
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
cudaSafeCall( cudaDeviceSynchronize() );
|
||||||
@@ -508,6 +522,7 @@ namespace
|
|||||||
NppiSize sz;
|
NppiSize sz;
|
||||||
sz.width = src.cols;
|
sz.width = src.cols;
|
||||||
sz.height = src.rows;
|
sz.height = src.rows;
|
||||||
|
|
||||||
nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );
|
nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );
|
||||||
|
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
cudaSafeCall( cudaDeviceSynchronize() );
|
||||||
@@ -529,6 +544,14 @@ namespace
|
|||||||
|
|
||||||
typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
|
typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
|
||||||
};
|
};
|
||||||
|
template<int SCN> struct NppSetFunc<CV_8S, SCN>
|
||||||
|
{
|
||||||
|
typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
|
||||||
|
};
|
||||||
|
template<> struct NppSetFunc<CV_8S, 1>
|
||||||
|
{
|
||||||
|
typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
|
||||||
|
};
|
||||||
|
|
||||||
template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
|
template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
|
||||||
{
|
{
|
||||||
@@ -613,6 +636,35 @@ namespace
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
// CopyMasked
|
||||||
|
|
||||||
|
template<int SDEPTH> struct NppCopyMaskedFunc
|
||||||
|
{
|
||||||
|
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
|
||||||
|
|
||||||
|
typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
|
||||||
|
};
|
||||||
|
|
||||||
|
template<int SDEPTH, typename NppCopyMaskedFunc<SDEPTH>::func_ptr func> struct NppCopyMasked
|
||||||
|
{
|
||||||
|
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
|
||||||
|
|
||||||
|
static void copyMasked(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t /*stream*/)
|
||||||
|
{
|
||||||
|
NppiSize sz;
|
||||||
|
sz.width = src.cols;
|
||||||
|
sz.height = src.rows;
|
||||||
|
|
||||||
|
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<src_t>(), static_cast<int>(dst.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
|
||||||
|
|
||||||
|
cudaSafeCall( cudaDeviceSynchronize() );
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////
|
||||||
|
// CudaFuncTable
|
||||||
|
|
||||||
class CudaFuncTable : public GpuFuncTable
|
class CudaFuncTable : public GpuFuncTable
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
@@ -631,7 +683,26 @@ namespace
|
|||||||
|
|
||||||
void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
|
void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
|
||||||
{
|
{
|
||||||
::cv::gpu::copyWithMask(src, dst, mask);
|
CV_Assert(src.size() == dst.size() && src.type() == dst.type());
|
||||||
|
CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));
|
||||||
|
|
||||||
|
typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
|
||||||
|
|
||||||
|
static const caller_t callers[7][4] =
|
||||||
|
{
|
||||||
|
/* 8U */ {NppCopyMasked<CV_8U, nppiCopy_8u_C1MR>::copyMasked, cv::gpu::copyWithMask, NppCopyMasked<CV_8U, nppiCopy_8u_C3MR>::copyMasked, NppCopyMasked<CV_8U, nppiCopy_8u_C4MR>::copyMasked},
|
||||||
|
/* 8S */ {cv::gpu::copyWithMask, cv::gpu::copyWithMask, cv::gpu::copyWithMask, cv::gpu::copyWithMask},
|
||||||
|
/* 16U */ {NppCopyMasked<CV_16U, nppiCopy_16u_C1MR>::copyMasked, cv::gpu::copyWithMask, NppCopyMasked<CV_16U, nppiCopy_16u_C3MR>::copyMasked, NppCopyMasked<CV_16U, nppiCopy_16u_C4MR>::copyMasked},
|
||||||
|
/* 16S */ {NppCopyMasked<CV_16S, nppiCopy_16s_C1MR>::copyMasked, cv::gpu::copyWithMask, NppCopyMasked<CV_16S, nppiCopy_16s_C3MR>::copyMasked, NppCopyMasked<CV_16S, nppiCopy_16s_C4MR>::copyMasked},
|
||||||
|
/* 32S */ {NppCopyMasked<CV_32S, nppiCopy_32s_C1MR>::copyMasked, cv::gpu::copyWithMask, NppCopyMasked<CV_32S, nppiCopy_32s_C3MR>::copyMasked, NppCopyMasked<CV_32S, nppiCopy_32s_C4MR>::copyMasked},
|
||||||
|
/* 32F */ {NppCopyMasked<CV_32F, nppiCopy_32f_C1MR>::copyMasked, cv::gpu::copyWithMask, NppCopyMasked<CV_32F, nppiCopy_32f_C3MR>::copyMasked, NppCopyMasked<CV_32F, nppiCopy_32f_C4MR>::copyMasked},
|
||||||
|
/* 64F */ {cv::gpu::copyWithMask, cv::gpu::copyWithMask, cv::gpu::copyWithMask, cv::gpu::copyWithMask}
|
||||||
|
};
|
||||||
|
|
||||||
|
caller_t func = mask.channels() == src.channels() ? callers[src.depth()][src.channels()] : cv::gpu::copyWithMask;
|
||||||
|
CV_DbgAssert(func != 0);
|
||||||
|
|
||||||
|
func(src, dst, mask, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
void convert(const GpuMat& src, GpuMat& dst) const
|
void convert(const GpuMat& src, GpuMat& dst) const
|
||||||
@@ -641,65 +712,65 @@ namespace
|
|||||||
{
|
{
|
||||||
{
|
{
|
||||||
/* 8U -> 8U */ {0, 0, 0, 0},
|
/* 8U -> 8U */ {0, 0, 0, 0},
|
||||||
/* 8U -> 8S */ {::cv::gpu::convertTo, ::cv::gpu::convertTo, ::cv::gpu::convertTo, ::cv::gpu::convertTo},
|
/* 8U -> 8S */ {cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo, cv::gpu::convertTo},
|
||||||
/* 8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::cvt},
|
/* 8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::cvt},
|
||||||
/* 8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::cvt},
|
/* 8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::cvt},
|
||||||
/* 8U -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 8U -> 32S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 8U -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
|
/* 8U -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
/* 8S -> 8U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 8S -> 8U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 8S -> 8S */ {0,0,0,0},
|
/* 8S -> 8S */ {0,0,0,0},
|
||||||
/* 8S -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 8S -> 16U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 8S -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 8S -> 16S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 8S -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 8S -> 32S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 8S -> 32F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 8S -> 32F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 8S -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
|
/* 8S -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
/* 16U -> 8U */ {NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::cvt},
|
/* 16U -> 8U */ {NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::cvt},
|
||||||
/* 16U -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 16U -> 8S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 16U -> 16U */ {0,0,0,0},
|
/* 16U -> 16U */ {0,0,0,0},
|
||||||
/* 16U -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 16U -> 16S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 16U -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
|
/* 16U -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
/* 16S -> 8U */ {NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::cvt},
|
/* 16S -> 8U */ {NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::cvt},
|
||||||
/* 16S -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 16S -> 8S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 16S -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 16S -> 16U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 16S -> 16S */ {0,0,0,0},
|
/* 16S -> 16S */ {0,0,0,0},
|
||||||
/* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 16S -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
|
/* 16S -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
/* 32S -> 8U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 32S -> 8U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 32S -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 32S -> 8S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 32S -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 32S -> 16U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 32S -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 32S -> 16S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 32S -> 32S */ {0,0,0,0},
|
/* 32S -> 32S */ {0,0,0,0},
|
||||||
/* 32S -> 32F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 32S -> 32F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 32S -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
|
/* 32S -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
/* 32F -> 8U */ {NppCvt<CV_32F, CV_8U, nppiConvert_32f8u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 32F -> 8U */ {NppCvt<CV_32F, CV_8U, nppiConvert_32f8u_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 32F -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 32F -> 8S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::cvt,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 32F -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 32F -> 32S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 32F -> 32F */ {0,0,0,0},
|
/* 32F -> 32F */ {0,0,0,0},
|
||||||
/* 32F -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
|
/* 32F -> 64F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo}
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
/* 64F -> 8U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 64F -> 8U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 64F -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 64F -> 8S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 64F -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 64F -> 16U */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 64F -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 64F -> 16S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 64F -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 64F -> 32S */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 64F -> 32F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
|
/* 64F -> 32F */ {cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo,cv::gpu::convertTo},
|
||||||
/* 64F -> 64F */ {0,0,0,0}
|
/* 64F -> 64F */ {0,0,0,0}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@@ -712,7 +783,7 @@ namespace
|
|||||||
|
|
||||||
void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
|
void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
|
||||||
{
|
{
|
||||||
::cv::gpu::convertTo(src, dst, alpha, beta);
|
cv::gpu::convertTo(src, dst, alpha, beta);
|
||||||
}
|
}
|
||||||
|
|
||||||
void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
|
void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
|
||||||
@@ -744,13 +815,13 @@ namespace
|
|||||||
typedef void (*caller_t)(GpuMat& src, Scalar s);
|
typedef void (*caller_t)(GpuMat& src, Scalar s);
|
||||||
static const caller_t callers[7][4] =
|
static const caller_t callers[7][4] =
|
||||||
{
|
{
|
||||||
{NppSet<CV_8U, 1, nppiSet_8u_C1R>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
|
{NppSet<CV_8U, 1, nppiSet_8u_C1R>::set, cv::gpu::setTo, cv::gpu::setTo, NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
|
||||||
{::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo},
|
{NppSet<CV_8S, 1, nppiSet_8s_C1R>::set, NppSet<CV_8S, 2, nppiSet_8s_C2R>::set, NppSet<CV_8S, 3, nppiSet_8s_C3R>::set, NppSet<CV_8S, 4, nppiSet_8s_C4R>::set},
|
||||||
{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set, NppSet<CV_16U, 2, nppiSet_16u_C2R>::set, ::cv::gpu::setTo, NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
|
{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set, NppSet<CV_16U, 2, nppiSet_16u_C2R>::set, cv::gpu::setTo, NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
|
||||||
{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set, NppSet<CV_16S, 2, nppiSet_16s_C2R>::set, ::cv::gpu::setTo, NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
|
{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set, NppSet<CV_16S, 2, nppiSet_16s_C2R>::set, cv::gpu::setTo, NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
|
||||||
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
|
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::set, cv::gpu::setTo, cv::gpu::setTo, NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
|
||||||
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
|
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::set, cv::gpu::setTo, cv::gpu::setTo, NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
|
||||||
{::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo}
|
{cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo}
|
||||||
};
|
};
|
||||||
|
|
||||||
callers[m.depth()][m.channels() - 1](m, s);
|
callers[m.depth()][m.channels() - 1](m, s);
|
||||||
@@ -761,13 +832,13 @@ namespace
|
|||||||
|
|
||||||
static const caller_t callers[7][4] =
|
static const caller_t callers[7][4] =
|
||||||
{
|
{
|
||||||
{NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
|
{NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
|
||||||
{::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo},
|
{cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo},
|
||||||
{NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
|
{NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
|
||||||
{NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
|
{NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
|
||||||
{NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
|
{NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
|
||||||
{NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
|
{NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set, cv::gpu::setTo, cv::gpu::setTo, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
|
||||||
{::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo}
|
{cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo, cv::gpu::setTo}
|
||||||
};
|
};
|
||||||
|
|
||||||
callers[m.depth()][m.channels() - 1](m, s, mask);
|
callers[m.depth()][m.channels() - 1](m, s, mask);
|
||||||
|
@@ -69,18 +69,14 @@ Performs a mean-shift segmentation of the source image and eliminates small segm
|
|||||||
|
|
||||||
gpu::integral
|
gpu::integral
|
||||||
-----------------
|
-----------------
|
||||||
Computes an integral image and a squared integral image.
|
Computes an integral image.
|
||||||
|
|
||||||
.. ocv:function:: void gpu::integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null())
|
.. ocv:function:: void gpu::integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null())
|
||||||
|
|
||||||
.. ocv:function:: void gpu::integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& stream = Stream::Null())
|
|
||||||
|
|
||||||
:param src: Source image. Only ``CV_8UC1`` images are supported for now.
|
:param src: Source image. Only ``CV_8UC1`` images are supported for now.
|
||||||
|
|
||||||
:param sum: Integral image containing 32-bit unsigned integer values packed into ``CV_32SC1`` .
|
:param sum: Integral image containing 32-bit unsigned integer values packed into ``CV_32SC1`` .
|
||||||
|
|
||||||
:param sqsum: Squared integral image of the ``CV_32FC1`` type.
|
|
||||||
|
|
||||||
:param stream: Stream for the asynchronous version.
|
:param stream: Stream for the asynchronous version.
|
||||||
|
|
||||||
.. seealso:: :ocv:func:`integral`
|
.. seealso:: :ocv:func:`integral`
|
||||||
@@ -380,6 +376,22 @@ Converts an image from one color space to another.
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
gpu::swapChannels
|
||||||
|
-----------------
|
||||||
|
Exchanges the color channels of an image in-place.
|
||||||
|
|
||||||
|
.. ocv:function:: void gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& stream = Stream::Null())
|
||||||
|
|
||||||
|
:param src: Source image. Supports only ``CV_8UC4`` type.
|
||||||
|
|
||||||
|
:param dstOrder: Integer array describing how channel values are permutated. The n-th entry of the array contains the number of the channel that is stored in the n-th channel of the output image. E.g. Given an RGBA image, aDstOrder = [3,2,1,0] converts this to ABGR channel order.
|
||||||
|
|
||||||
|
:param stream: Stream for the asynchronous version.
|
||||||
|
|
||||||
|
The methods support arbitrary permutations of the original channels, including replication.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
gpu::threshold
|
gpu::threshold
|
||||||
------------------
|
------------------
|
||||||
Applies a fixed-level threshold to each array element.
|
Applies a fixed-level threshold to each array element.
|
||||||
@@ -489,7 +501,7 @@ Rotates an image around the origin (0,0) and then shifts it.
|
|||||||
|
|
||||||
.. ocv:function:: void gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null())
|
.. ocv:function:: void gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null())
|
||||||
|
|
||||||
:param src: Source image. ``CV_8UC1`` and ``CV_8UC4`` types are supported.
|
:param src: Source image. Supports 1, 3 or 4 channels images with ``CV_8U`` , ``CV_16U`` or ``CV_32F`` depth.
|
||||||
|
|
||||||
:param dst: Destination image with the same type as ``src`` . The size is ``dsize`` .
|
:param dst: Destination image with the same type as ``src`` . The size is ``dsize`` .
|
||||||
|
|
||||||
@@ -751,6 +763,38 @@ Performs linear blending of two images.
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
gpu::alphaComp
|
||||||
|
-------------------
|
||||||
|
Composites two images using alpha opacity values contained in each image.
|
||||||
|
|
||||||
|
.. ocv:function:: void gpu::alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream = Stream::Null())
|
||||||
|
|
||||||
|
:param img1: First image. Supports ``CV_8UC4`` , ``CV_16UC4`` , ``CV_32SC4`` and ``CV_32FC4`` types.
|
||||||
|
|
||||||
|
:param img1: Second image. Must have the same size and the same type as ``img1`` .
|
||||||
|
|
||||||
|
:param dst: Destination image.
|
||||||
|
|
||||||
|
:param alpha_op: Flag specifying the alpha-blending operation:
|
||||||
|
|
||||||
|
* **ALPHA_OVER**
|
||||||
|
* **ALPHA_IN**
|
||||||
|
* **ALPHA_OUT**
|
||||||
|
* **ALPHA_ATOP**
|
||||||
|
* **ALPHA_XOR**
|
||||||
|
* **ALPHA_PLUS**
|
||||||
|
* **ALPHA_OVER_PREMUL**
|
||||||
|
* **ALPHA_IN_PREMUL**
|
||||||
|
* **ALPHA_OUT_PREMUL**
|
||||||
|
* **ALPHA_ATOP_PREMUL**
|
||||||
|
* **ALPHA_XOR_PREMUL**
|
||||||
|
* **ALPHA_PLUS_PREMUL**
|
||||||
|
* **ALPHA_PREMUL**
|
||||||
|
|
||||||
|
:param stream: Stream for the asynchronous version.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
gpu::Canny
|
gpu::Canny
|
||||||
-------------------
|
-------------------
|
||||||
Finds edges in an image using the [Canny86]_ algorithm.
|
Finds edges in an image using the [Canny86]_ algorithm.
|
||||||
|
@@ -10,6 +10,7 @@ gpu::meanStdDev
|
|||||||
Computes a mean value and a standard deviation of matrix elements.
|
Computes a mean value and a standard deviation of matrix elements.
|
||||||
|
|
||||||
.. ocv:function:: void gpu::meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev)
|
.. ocv:function:: void gpu::meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev)
|
||||||
|
.. ocv:function:: void gpu::meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
|
||||||
|
|
||||||
:param mtx: Source matrix. ``CV_8UC1`` matrices are supported for now.
|
:param mtx: Source matrix. ``CV_8UC1`` matrices are supported for now.
|
||||||
|
|
||||||
@@ -17,6 +18,8 @@ Computes a mean value and a standard deviation of matrix elements.
|
|||||||
|
|
||||||
:param stddev: Standard deviation value.
|
:param stddev: Standard deviation value.
|
||||||
|
|
||||||
|
:param buf: Optional buffer to avoid extra memory allocations. It is resized automatically.
|
||||||
|
|
||||||
.. seealso:: :ocv:func:`meanStdDev`
|
.. seealso:: :ocv:func:`meanStdDev`
|
||||||
|
|
||||||
|
|
||||||
|
@@ -63,7 +63,7 @@ Flips a 2D matrix around vertical, horizontal, or both axes.
|
|||||||
|
|
||||||
.. ocv:function:: void gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream = Stream::Null())
|
.. ocv:function:: void gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream = Stream::Null())
|
||||||
|
|
||||||
:param src: Source matrix. Only ``CV_8UC1`` and ``CV_8UC4`` matrices are supported for now.
|
:param src: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U``, ``CV_16U``, ``CV_32S`` or ``CV_32F`` depth.
|
||||||
|
|
||||||
:param dst: Destination matrix.
|
:param dst: Destination matrix.
|
||||||
|
|
||||||
|
@@ -139,6 +139,50 @@ where ``I`` is a multi-dimensional index of array elements. In case of multi-cha
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
gpu::abs
|
||||||
|
------------
|
||||||
|
Computes an absolute value of each matrix element.
|
||||||
|
|
||||||
|
.. ocv:function:: void gpu::abs(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
|
||||||
|
|
||||||
|
:param src: Source matrix. Supports ``CV_16S`` and ``CV_32F`` depth.
|
||||||
|
|
||||||
|
:param dst: Destination matrix with the same size and type as ``src`` .
|
||||||
|
|
||||||
|
:param stream: Stream for the asynchronous version.
|
||||||
|
|
||||||
|
.. seealso:: :ocv:func:`abs`
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
gpu::sqr
|
||||||
|
------------
|
||||||
|
Computes a square value of each matrix element.
|
||||||
|
|
||||||
|
.. ocv:function:: void gpu::sqr(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
|
||||||
|
|
||||||
|
:param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
|
||||||
|
|
||||||
|
:param dst: Destination matrix with the same size and type as ``src`` .
|
||||||
|
|
||||||
|
:param stream: Stream for the asynchronous version.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
gpu::sqrt
|
||||||
|
------------
|
||||||
|
Computes a square root of each matrix element.
|
||||||
|
|
||||||
|
.. ocv:function:: void gpu::sqrt(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
|
||||||
|
|
||||||
|
:param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
|
||||||
|
|
||||||
|
:param dst: Destination matrix with the same size and type as ``src`` .
|
||||||
|
|
||||||
|
:param stream: Stream for the asynchronous version.
|
||||||
|
|
||||||
|
.. seealso:: :ocv:func:`sqrt`
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
gpu::exp
|
gpu::exp
|
||||||
@@ -147,7 +191,7 @@ Computes an exponent of each matrix element.
|
|||||||
|
|
||||||
.. ocv:function:: void gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
|
.. ocv:function:: void gpu::exp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
|
||||||
|
|
||||||
:param src: Source matrix. ``CV_32FC1`` matrixes are supported for now.
|
:param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
|
||||||
|
|
||||||
:param dst: Destination matrix with the same size and type as ``src`` .
|
:param dst: Destination matrix with the same size and type as ``src`` .
|
||||||
|
|
||||||
@@ -157,6 +201,22 @@ Computes an exponent of each matrix element.
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
gpu::log
|
||||||
|
------------
|
||||||
|
Computes a natural logarithm of absolute value of each matrix element.
|
||||||
|
|
||||||
|
.. ocv:function:: void gpu::log(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
|
||||||
|
|
||||||
|
:param src: Source matrix. Supports ``CV_8U`` , ``CV_16U`` , ``CV_16S`` and ``CV_32F`` depth.
|
||||||
|
|
||||||
|
:param dst: Destination matrix with the same size and type as ``src`` .
|
||||||
|
|
||||||
|
:param stream: Stream for the asynchronous version.
|
||||||
|
|
||||||
|
.. seealso:: :ocv:func:`log`
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
gpu::pow
|
gpu::pow
|
||||||
------------
|
------------
|
||||||
Raises every matrix element to a power.
|
Raises every matrix element to a power.
|
||||||
@@ -181,22 +241,6 @@ The function ``pow`` raises every element of the input matrix to ``p`` :
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
gpu::log
|
|
||||||
------------
|
|
||||||
Computes a natural logarithm of absolute value of each matrix element.
|
|
||||||
|
|
||||||
.. ocv:function:: void gpu::log(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
|
|
||||||
|
|
||||||
:param src: Source matrix. ``CV_32FC1`` matrixes are supported for now.
|
|
||||||
|
|
||||||
:param dst: Destination matrix with the same size and type as ``src`` .
|
|
||||||
|
|
||||||
:param stream: Stream for the asynchronous version.
|
|
||||||
|
|
||||||
.. seealso:: :ocv:func:`log`
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
gpu::absdiff
|
gpu::absdiff
|
||||||
----------------
|
----------------
|
||||||
Computes per-element absolute difference of two matrices (or of a matrix and scalar).
|
Computes per-element absolute difference of two matrices (or of a matrix and scalar).
|
||||||
@@ -262,9 +306,10 @@ Performs a per-element bitwise inversion.
|
|||||||
|
|
||||||
gpu::bitwise_or
|
gpu::bitwise_or
|
||||||
-------------------
|
-------------------
|
||||||
Performs a per-element bitwise disjunction of two matrices.
|
Performs a per-element bitwise disjunction of two matrices or of a matrix and a scalar.
|
||||||
|
|
||||||
.. ocv:function:: void gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
|
.. ocv:function:: void gpu::bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
|
||||||
|
.. ocv:function:: void gpu::bitwise_or(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
|
||||||
|
|
||||||
:param src1: First source matrix.
|
:param src1: First source matrix.
|
||||||
|
|
||||||
@@ -280,9 +325,10 @@ Performs a per-element bitwise disjunction of two matrices.
|
|||||||
|
|
||||||
gpu::bitwise_and
|
gpu::bitwise_and
|
||||||
--------------------
|
--------------------
|
||||||
Performs a per-element bitwise conjunction of two matrices.
|
Performs a per-element bitwise conjunction of two matrices or of a matrix and a scalar.
|
||||||
|
|
||||||
.. ocv:function:: void gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
|
.. ocv:function:: void gpu::bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
|
||||||
|
.. ocv:function:: void gpu::bitwise_and(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
|
||||||
|
|
||||||
:param src1: First source matrix.
|
:param src1: First source matrix.
|
||||||
|
|
||||||
@@ -298,9 +344,10 @@ Performs a per-element bitwise conjunction of two matrices.
|
|||||||
|
|
||||||
gpu::bitwise_xor
|
gpu::bitwise_xor
|
||||||
--------------------
|
--------------------
|
||||||
Performs a per-element bitwise ``exclusive or`` operation of two matrices.
|
Performs a per-element bitwise ``exclusive or`` operation of two matrices or of a matrix and a scalar.
|
||||||
|
|
||||||
.. ocv:function:: void gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
|
.. ocv:function:: void gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null())
|
||||||
|
.. ocv:function:: void gpu::bitwise_xor(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
|
||||||
|
|
||||||
:param src1: First source matrix.
|
:param src1: First source matrix.
|
||||||
|
|
||||||
@@ -314,6 +361,38 @@ Performs a per-element bitwise ``exclusive or`` operation of two matrices.
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
gpu::rshift
|
||||||
|
--------------------
|
||||||
|
Performs pixel by pixel right shift of an image by a constant value.
|
||||||
|
|
||||||
|
.. ocv:function:: void gpu::rshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
|
||||||
|
|
||||||
|
:param src: Source matrix. Supports 1, 3 and 4 channel images with integer elements.
|
||||||
|
|
||||||
|
:param sc: Constant values, one per channel.
|
||||||
|
|
||||||
|
:param dst: Destination matrix with the same size and type as ``src`` .
|
||||||
|
|
||||||
|
:param stream: Stream for the asynchronous version.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
gpu::lshift
|
||||||
|
--------------------
|
||||||
|
Performs pixel by pixel left shift of an image by a constant value.
|
||||||
|
|
||||||
|
.. ocv:function:: void gpu::lshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null())
|
||||||
|
|
||||||
|
:param src: Source matrix. Supports 1, 3 and 4 channels images with ``CV_8U`` , ``CV_16U`` or ``CV_32S`` depth.
|
||||||
|
|
||||||
|
:param sc: Constant values, one per channel.
|
||||||
|
|
||||||
|
:param dst: Destination matrix with the same size and type as ``src`` .
|
||||||
|
|
||||||
|
:param stream: Stream for the asynchronous version.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
gpu::min
|
gpu::min
|
||||||
------------
|
------------
|
||||||
Computes the per-element minimum of two matrices (or a matrix and a scalar).
|
Computes the per-element minimum of two matrices (or a matrix and a scalar).
|
||||||
|
@@ -498,7 +498,7 @@ CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
|
|||||||
CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst, Stream& stream = Stream::Null());
|
CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
//! reverses the order of the rows, columns or both in a matrix
|
//! reverses the order of the rows, columns or both in a matrix
|
||||||
//! supports CV_8UC1, CV_8UC4 types
|
//! supports 1, 3 and 4 channels images with CV_8U, CV_16U, CV_32S or CV_32F depth
|
||||||
CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode, Stream& stream = Stream::Null());
|
CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
|
//! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
|
||||||
@@ -586,20 +586,32 @@ CV_EXPORTS void absdiff(const GpuMat& a, const GpuMat& b, GpuMat& c, Stream& str
|
|||||||
//! computes element-wise absolute difference of array and scalar (c = abs(a - s))
|
//! computes element-wise absolute difference of array and scalar (c = abs(a - s))
|
||||||
CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null());
|
CV_EXPORTS void absdiff(const GpuMat& a, const Scalar& s, GpuMat& c, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
|
//! computes absolute value of each matrix element
|
||||||
|
//! supports CV_16S and CV_32F depth
|
||||||
|
CV_EXPORTS void abs(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
|
//! computes square of each pixel in an image
|
||||||
|
//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
|
||||||
|
CV_EXPORTS void sqr(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
|
//! computes square root of each pixel in an image
|
||||||
|
//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
|
||||||
|
CV_EXPORTS void sqrt(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
//! computes exponent of each matrix element (b = e**a)
|
//! computes exponent of each matrix element (b = e**a)
|
||||||
//! supports only CV_32FC1 type
|
//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
|
||||||
CV_EXPORTS void exp(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
|
CV_EXPORTS void exp(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
|
//! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))
|
||||||
|
//! supports CV_8U, CV_16U, CV_16S and CV_32F depth
|
||||||
|
CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
//! computes power of each matrix element:
|
//! computes power of each matrix element:
|
||||||
// (dst(i,j) = pow( src(i,j) , power), if src.type() is integer
|
// (dst(i,j) = pow( src(i,j) , power), if src.type() is integer
|
||||||
// (dst(i,j) = pow(fabs(src(i,j)), power), otherwise
|
// (dst(i,j) = pow(fabs(src(i,j)), power), otherwise
|
||||||
//! supports all, except depth == CV_64F
|
//! supports all, except depth == CV_64F
|
||||||
CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null());
|
CV_EXPORTS void pow(const GpuMat& src, double power, GpuMat& dst, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
//! computes natural logarithm of absolute value of each matrix element: b = log(abs(a))
|
|
||||||
//! supports only CV_32FC1 type
|
|
||||||
CV_EXPORTS void log(const GpuMat& a, GpuMat& b, Stream& stream = Stream::Null());
|
|
||||||
|
|
||||||
//! compares elements of two arrays (c = a <cmpop> b)
|
//! compares elements of two arrays (c = a <cmpop> b)
|
||||||
CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
|
CV_EXPORTS void compare(const GpuMat& a, const GpuMat& b, GpuMat& c, int cmpop, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
@@ -608,12 +620,29 @@ CV_EXPORTS void bitwise_not(const GpuMat& src, GpuMat& dst, const GpuMat& mask=G
|
|||||||
|
|
||||||
//! calculates per-element bit-wise disjunction of two arrays
|
//! calculates per-element bit-wise disjunction of two arrays
|
||||||
CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
|
CV_EXPORTS void bitwise_or(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
|
||||||
|
//! calculates per-element bit-wise disjunction of array and scalar
|
||||||
|
//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
|
||||||
|
CV_EXPORTS void bitwise_or(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
//! calculates per-element bit-wise conjunction of two arrays
|
//! calculates per-element bit-wise conjunction of two arrays
|
||||||
CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
|
CV_EXPORTS void bitwise_and(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
|
||||||
|
//! calculates per-element bit-wise conjunction of array and scalar
|
||||||
|
//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
|
||||||
|
CV_EXPORTS void bitwise_and(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
//! calculates per-element bit-wise "exclusive or" operation
|
//! calculates per-element bit-wise "exclusive or" operation
|
||||||
CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
|
CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask=GpuMat(), Stream& stream = Stream::Null());
|
||||||
|
//! calculates per-element bit-wise "exclusive or" of array and scalar
|
||||||
|
//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
|
||||||
|
CV_EXPORTS void bitwise_xor(const GpuMat& src1, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
|
//! pixel by pixel right shift of an image by a constant value
|
||||||
|
//! supports 1, 3 and 4 channel images with integer elements
|
||||||
|
CV_EXPORTS void rshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
|
//! pixel by pixel left shift of an image by a constant value
|
||||||
|
//! supports 1, 3 and 4 channels images with CV_8U, CV_16U or CV_32S depth
|
||||||
|
CV_EXPORTS void lshift(const GpuMat& src, const Scalar& sc, GpuMat& dst, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
//! computes per-element minimum of two arrays (dst = min(src1, src2))
|
//! computes per-element minimum of two arrays (dst = min(src1, src2))
|
||||||
CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
|
CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream& stream = Stream::Null());
|
||||||
@@ -627,6 +656,13 @@ CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, Stream&
|
|||||||
//! computes per-element maximum of array and scalar (dst = max(src1, src2))
|
//! computes per-element maximum of array and scalar (dst = max(src1, src2))
|
||||||
CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());
|
CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
|
enum { ALPHA_OVER, ALPHA_IN, ALPHA_OUT, ALPHA_ATOP, ALPHA_XOR, ALPHA_PLUS, ALPHA_OVER_PREMUL, ALPHA_IN_PREMUL, ALPHA_OUT_PREMUL,
|
||||||
|
ALPHA_ATOP_PREMUL, ALPHA_XOR_PREMUL, ALPHA_PLUS_PREMUL, ALPHA_PREMUL};
|
||||||
|
|
||||||
|
//! Composite two images using alpha opacity values contained in each image
|
||||||
|
//! Supports CV_8UC4, CV_16UC4, CV_32SC4 and CV_32FC4 types
|
||||||
|
CV_EXPORTS void alphaComp(const GpuMat& img1, const GpuMat& img2, GpuMat& dst, int alpha_op, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////// Image processing //////////////////////////////
|
////////////////////////////// Image processing //////////////////////////////
|
||||||
|
|
||||||
@@ -665,6 +701,13 @@ CV_EXPORTS void reprojectImageTo3D(const GpuMat& disp, GpuMat& xyzw, const Mat&
|
|||||||
//! converts image from one color space to another
|
//! converts image from one color space to another
|
||||||
CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null());
|
CV_EXPORTS void cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn = 0, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
|
//! swap channels
|
||||||
|
//! dstOrder - Integer array describing how channel values are permutated. The n-th entry
|
||||||
|
//! of the array contains the number of the channel that is stored in the n-th channel of
|
||||||
|
//! the output image. E.g. Given an RGBA image, aDstOrder = [3,2,1,0] converts this to ABGR
|
||||||
|
//! channel order.
|
||||||
|
CV_EXPORTS void swapChannels(GpuMat& image, const int dstOrder[4], Stream& stream = Stream::Null());
|
||||||
|
|
||||||
//! applies fixed threshold to the image
|
//! applies fixed threshold to the image
|
||||||
CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());
|
CV_EXPORTS double threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxval, int type, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
@@ -692,9 +735,9 @@ CV_EXPORTS void buildWarpCylindricalMaps(Size src_size, Rect dst_roi, const Mat
|
|||||||
CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
|
CV_EXPORTS void buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K, const Mat& R, float scale,
|
||||||
GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null());
|
GpuMat& map_x, GpuMat& map_y, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
//! rotate 8bit single or four channel image
|
//! rotates an image around the origin (0,0) and then shifts it
|
||||||
//! Supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
|
//! supports INTER_NEAREST, INTER_LINEAR, INTER_CUBIC
|
||||||
//! supports CV_8UC1, CV_8UC4 types
|
//! supports 1, 3 or 4 channels images with CV_8U, CV_16U or CV_32F depth
|
||||||
CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0,
|
CV_EXPORTS void rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift = 0, double yShift = 0,
|
||||||
int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
|
int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
@@ -706,15 +749,9 @@ CV_EXPORTS void copyMakeBorder(const GpuMat& src, GpuMat& dst, int top, int bott
|
|||||||
//! sum will have CV_32S type, but will contain unsigned int values
|
//! sum will have CV_32S type, but will contain unsigned int values
|
||||||
//! supports only CV_8UC1 source type
|
//! supports only CV_8UC1 source type
|
||||||
CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null());
|
CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
//! buffered version
|
//! buffered version
|
||||||
CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& stream = Stream::Null());
|
CV_EXPORTS void integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
//! computes the integral image and integral for the squared image
|
|
||||||
//! sum will have CV_32S type, sqsum - CV32F type
|
|
||||||
//! supports only CV_8UC1 source type
|
|
||||||
CV_EXPORTS void integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& stream = Stream::Null());
|
|
||||||
|
|
||||||
//! computes squared integral image
|
//! computes squared integral image
|
||||||
//! result matrix will have 64F type, but will contain 64U values
|
//! result matrix will have 64F type, but will contain 64U values
|
||||||
//! supports source images of 8UC1 type only
|
//! supports source images of 8UC1 type only
|
||||||
@@ -859,6 +896,8 @@ private:
|
|||||||
//! computes mean value and standard deviation of all or selected array elements
|
//! computes mean value and standard deviation of all or selected array elements
|
||||||
//! supports only CV_8UC1 type
|
//! supports only CV_8UC1 type
|
||||||
CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);
|
CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);
|
||||||
|
//! buffered version
|
||||||
|
CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev, GpuMat& buf);
|
||||||
|
|
||||||
//! computes norm of array
|
//! computes norm of array
|
||||||
//! supports NORM_INF, NORM_L1, NORM_L2
|
//! supports NORM_INF, NORM_L1, NORM_L2
|
||||||
@@ -939,10 +978,16 @@ CV_EXPORTS void solvePnPRansac(const Mat& object, const Mat& image, const Mat& c
|
|||||||
|
|
||||||
//////////////////////////////// Image Labeling ////////////////////////////////
|
//////////////////////////////// Image Labeling ////////////////////////////////
|
||||||
|
|
||||||
//!performs labeling via graph cuts
|
//!performs labeling via graph cuts of a 2D regular 4-connected graph.
|
||||||
CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels,
|
CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels,
|
||||||
GpuMat& buf, Stream& stream = Stream::Null());
|
GpuMat& buf, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
|
//!performs labeling via graph cuts of a 2D regular 8-connected graph.
|
||||||
|
CV_EXPORTS void graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
|
||||||
|
GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight,
|
||||||
|
GpuMat& labels,
|
||||||
|
GpuMat& buf, Stream& stream = Stream::Null());
|
||||||
|
|
||||||
////////////////////////////////// Histograms //////////////////////////////////
|
////////////////////////////////// Histograms //////////////////////////////////
|
||||||
|
|
||||||
//! Compute levels with even distribution. levels will have 1 row and nLevels cols and CV_32SC1 type.
|
//! Compute levels with even distribution. levels will have 1 row and nLevels cols and CV_32SC1 type.
|
||||||
|
@@ -59,7 +59,7 @@ GPU_PERF_TEST(Flip, cv::gpu::DeviceInfo, cv::Size, perf::MatType, FlipCode)
|
|||||||
INSTANTIATE_TEST_CASE_P(Arithm, Flip, testing::Combine(
|
INSTANTIATE_TEST_CASE_P(Arithm, Flip, testing::Combine(
|
||||||
ALL_DEVICES,
|
ALL_DEVICES,
|
||||||
GPU_TYPICAL_MAT_SIZES,
|
GPU_TYPICAL_MAT_SIZES,
|
||||||
testing::Values(CV_8UC1, CV_8UC4),
|
testing::Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4),
|
||||||
testing::Values((int) HORIZONTAL_AXIS, (int) VERTICAL_AXIS, (int) BOTH_AXIS)));
|
testing::Values((int) HORIZONTAL_AXIS, (int) VERTICAL_AXIS, (int) BOTH_AXIS)));
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
@@ -363,6 +363,33 @@ INSTANTIATE_TEST_CASE_P(Arithm, BitwiseAnd, testing::Combine(
|
|||||||
GPU_TYPICAL_MAT_SIZES,
|
GPU_TYPICAL_MAT_SIZES,
|
||||||
testing::Values(CV_8UC1, CV_16UC1, CV_32SC1)));
|
testing::Values(CV_8UC1, CV_16UC1, CV_32SC1)));
|
||||||
|
|
||||||
|
GPU_PERF_TEST(BitwiseScalarAnd, cv::gpu::DeviceInfo, cv::Size, perf::MatType)
|
||||||
|
{
|
||||||
|
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
|
||||||
|
cv::Size size = GET_PARAM(1);
|
||||||
|
int type = GET_PARAM(2);
|
||||||
|
|
||||||
|
cv::gpu::setDevice(devInfo.deviceID());
|
||||||
|
|
||||||
|
cv::Mat src_host(size, type);
|
||||||
|
|
||||||
|
declare.in(src_host, WARMUP_RNG);
|
||||||
|
|
||||||
|
cv::gpu::GpuMat src(src_host);
|
||||||
|
cv::gpu::GpuMat dst;
|
||||||
|
cv::Scalar sc = cv::Scalar(123, 123, 123, 123);
|
||||||
|
|
||||||
|
TEST_CYCLE()
|
||||||
|
{
|
||||||
|
cv::gpu::bitwise_and(src, sc, dst);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_CASE_P(Arithm, BitwiseScalarAnd, testing::Combine(
|
||||||
|
ALL_DEVICES,
|
||||||
|
GPU_TYPICAL_MAT_SIZES,
|
||||||
|
testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32SC1, CV_32SC3, CV_32SC4)));
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
// Min
|
// Min
|
||||||
|
|
||||||
@@ -411,10 +438,11 @@ GPU_PERF_TEST(MeanStdDev, cv::gpu::DeviceInfo, cv::Size)
|
|||||||
cv::gpu::GpuMat src(src_host);
|
cv::gpu::GpuMat src(src_host);
|
||||||
cv::Scalar mean;
|
cv::Scalar mean;
|
||||||
cv::Scalar stddev;
|
cv::Scalar stddev;
|
||||||
|
cv::gpu::GpuMat buf;
|
||||||
|
|
||||||
TEST_CYCLE()
|
TEST_CYCLE()
|
||||||
{
|
{
|
||||||
cv::gpu::meanStdDev(src, mean, stddev);
|
cv::gpu::meanStdDev(src, mean, stddev, buf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -201,7 +201,7 @@ GPU_PERF_TEST(CvtColor, cv::gpu::DeviceInfo, cv::Size, perf::MatType, CvtColorIn
|
|||||||
declare.in(src_host, WARMUP_RNG);
|
declare.in(src_host, WARMUP_RNG);
|
||||||
|
|
||||||
cv::gpu::GpuMat src(src_host);
|
cv::gpu::GpuMat src(src_host);
|
||||||
cv::gpu::GpuMat dst(size, CV_MAKETYPE(type, info.dcn));
|
cv::gpu::GpuMat dst;
|
||||||
|
|
||||||
TEST_CYCLE()
|
TEST_CYCLE()
|
||||||
{
|
{
|
||||||
@@ -218,6 +218,32 @@ INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, testing::Combine(
|
|||||||
CvtColorInfo(4, 4, cv::COLOR_BGR2XYZ), CvtColorInfo(4, 4, cv::COLOR_BGR2YCrCb), CvtColorInfo(4, 4, cv::COLOR_YCrCb2BGR),
|
CvtColorInfo(4, 4, cv::COLOR_BGR2XYZ), CvtColorInfo(4, 4, cv::COLOR_BGR2YCrCb), CvtColorInfo(4, 4, cv::COLOR_YCrCb2BGR),
|
||||||
CvtColorInfo(4, 4, cv::COLOR_BGR2HSV), CvtColorInfo(4, 4, cv::COLOR_HSV2BGR))));
|
CvtColorInfo(4, 4, cv::COLOR_BGR2HSV), CvtColorInfo(4, 4, cv::COLOR_HSV2BGR))));
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
// SwapChannels
|
||||||
|
|
||||||
|
GPU_PERF_TEST(SwapChannels, cv::gpu::DeviceInfo, cv::Size)
|
||||||
|
{
|
||||||
|
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
|
||||||
|
cv::Size size = GET_PARAM(1);
|
||||||
|
|
||||||
|
cv::gpu::setDevice(devInfo.deviceID());
|
||||||
|
|
||||||
|
cv::Mat src_host(size, CV_8UC4);
|
||||||
|
|
||||||
|
declare.in(src_host, WARMUP_RNG);
|
||||||
|
|
||||||
|
cv::gpu::GpuMat src(src_host);
|
||||||
|
|
||||||
|
const int dstOrder[] = {2, 1, 0, 3};
|
||||||
|
|
||||||
|
TEST_CYCLE()
|
||||||
|
{
|
||||||
|
cv::gpu::swapChannels(src, dstOrder);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_CASE_P(ImgProc, SwapChannels, testing::Combine(ALL_DEVICES, GPU_TYPICAL_MAT_SIZES));
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
// Threshold
|
// Threshold
|
||||||
|
|
||||||
@@ -457,7 +483,7 @@ GPU_PERF_TEST(Rotate, cv::gpu::DeviceInfo, cv::Size, perf::MatType, Interpolatio
|
|||||||
INSTANTIATE_TEST_CASE_P(ImgProc, Rotate, testing::Combine(
|
INSTANTIATE_TEST_CASE_P(ImgProc, Rotate, testing::Combine(
|
||||||
ALL_DEVICES,
|
ALL_DEVICES,
|
||||||
GPU_TYPICAL_MAT_SIZES,
|
GPU_TYPICAL_MAT_SIZES,
|
||||||
testing::Values(CV_8UC1, CV_8UC4),
|
testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||||
testing::Values((int) cv::INTER_NEAREST, (int) cv::INTER_LINEAR, (int) cv::INTER_CUBIC)));
|
testing::Values((int) cv::INTER_NEAREST, (int) cv::INTER_LINEAR, (int) cv::INTER_CUBIC)));
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
@@ -519,33 +545,6 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Integral, testing::Combine(
|
|||||||
ALL_DEVICES,
|
ALL_DEVICES,
|
||||||
GPU_TYPICAL_MAT_SIZES));
|
GPU_TYPICAL_MAT_SIZES));
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
|
||||||
// IntegralBoth
|
|
||||||
|
|
||||||
GPU_PERF_TEST(IntegralBoth, cv::gpu::DeviceInfo, cv::Size)
|
|
||||||
{
|
|
||||||
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
|
|
||||||
cv::Size size = GET_PARAM(1);
|
|
||||||
|
|
||||||
cv::gpu::setDevice(devInfo.deviceID());
|
|
||||||
|
|
||||||
cv::Mat src_host(size, CV_8UC1);
|
|
||||||
|
|
||||||
declare.in(src_host, WARMUP_RNG);
|
|
||||||
|
|
||||||
cv::gpu::GpuMat src(src_host);
|
|
||||||
cv::gpu::GpuMat sum, sqsum;
|
|
||||||
|
|
||||||
TEST_CYCLE()
|
|
||||||
{
|
|
||||||
cv::gpu::integral(src, sum, sqsum);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(ImgProc, IntegralBoth, testing::Combine(
|
|
||||||
ALL_DEVICES,
|
|
||||||
GPU_TYPICAL_MAT_SIZES));
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
// IntegralSqr
|
// IntegralSqr
|
||||||
|
|
||||||
@@ -849,6 +848,39 @@ INSTANTIATE_TEST_CASE_P(ImgProc, BlendLinear, testing::Combine(
|
|||||||
GPU_TYPICAL_MAT_SIZES,
|
GPU_TYPICAL_MAT_SIZES,
|
||||||
testing::Values(CV_8UC1, CV_32FC1)));
|
testing::Values(CV_8UC1, CV_32FC1)));
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
// AlphaComp
|
||||||
|
|
||||||
|
GPU_PERF_TEST(AlphaComp, cv::gpu::DeviceInfo, cv::Size, perf::MatType, AlphaOp)
|
||||||
|
{
|
||||||
|
cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
|
||||||
|
cv::Size size = GET_PARAM(1);
|
||||||
|
int type = GET_PARAM(2);
|
||||||
|
int alpha_op = GET_PARAM(3);
|
||||||
|
|
||||||
|
cv::gpu::setDevice(devInfo.deviceID());
|
||||||
|
|
||||||
|
cv::Mat img1_host(size, type);
|
||||||
|
cv::Mat img2_host(size, type);
|
||||||
|
|
||||||
|
declare.in(img1_host, img2_host, WARMUP_RNG);
|
||||||
|
|
||||||
|
cv::gpu::GpuMat img1(img1_host);
|
||||||
|
cv::gpu::GpuMat img2(img2_host);
|
||||||
|
cv::gpu::GpuMat dst;
|
||||||
|
|
||||||
|
TEST_CYCLE()
|
||||||
|
{
|
||||||
|
cv::gpu::alphaComp(img1, img2, dst, alpha_op);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_CASE_P(ImgProc, AlphaComp, testing::Combine(
|
||||||
|
ALL_DEVICES,
|
||||||
|
GPU_TYPICAL_MAT_SIZES,
|
||||||
|
testing::Values(CV_8UC4, CV_16UC4, CV_32SC4, CV_32FC4),
|
||||||
|
testing::Values((int)cv::gpu::ALPHA_OVER, (int)cv::gpu::ALPHA_IN, (int)cv::gpu::ALPHA_OUT, (int)cv::gpu::ALPHA_ATOP, (int)cv::gpu::ALPHA_XOR, (int)cv::gpu::ALPHA_PLUS, (int)cv::gpu::ALPHA_OVER_PREMUL, (int)cv::gpu::ALPHA_IN_PREMUL, (int)cv::gpu::ALPHA_OUT_PREMUL, (int)cv::gpu::ALPHA_ATOP_PREMUL, (int)cv::gpu::ALPHA_XOR_PREMUL, (int)cv::gpu::ALPHA_PLUS_PREMUL, (int)cv::gpu::ALPHA_PREMUL)));
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
// Canny
|
// Canny
|
||||||
|
|
||||||
|
@@ -11,7 +11,7 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
int main(int argc, char** argv)
|
int main()
|
||||||
{
|
{
|
||||||
printf("OpenCV was built without CUDA support\n");
|
printf("OpenCV was built without CUDA support\n");
|
||||||
return 0;
|
return 0;
|
||||||
|
@@ -11,6 +11,7 @@ CV_ENUM(FlipCode, HORIZONTAL_AXIS, VERTICAL_AXIS, BOTH_AXIS)
|
|||||||
CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC)
|
CV_ENUM(Interpolation, cv::INTER_NEAREST, cv::INTER_LINEAR, cv::INTER_CUBIC)
|
||||||
CV_ENUM(MatchMethod, cv::TM_SQDIFF, cv::TM_SQDIFF_NORMED, cv::TM_CCORR, cv::TM_CCORR_NORMED, cv::TM_CCOEFF, cv::TM_CCOEFF_NORMED)
|
CV_ENUM(MatchMethod, cv::TM_SQDIFF, cv::TM_SQDIFF_NORMED, cv::TM_CCORR, cv::TM_CCORR_NORMED, cv::TM_CCOEFF, cv::TM_CCOEFF_NORMED)
|
||||||
CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2)
|
CV_ENUM(NormType, cv::NORM_INF, cv::NORM_L1, cv::NORM_L2)
|
||||||
|
CV_ENUM(AlphaOp, cv::gpu::ALPHA_OVER, cv::gpu::ALPHA_IN, cv::gpu::ALPHA_OUT, cv::gpu::ALPHA_ATOP, cv::gpu::ALPHA_XOR, cv::gpu::ALPHA_PLUS, cv::gpu::ALPHA_OVER_PREMUL, cv::gpu::ALPHA_IN_PREMUL, cv::gpu::ALPHA_OUT_PREMUL, cv::gpu::ALPHA_ATOP_PREMUL, cv::gpu::ALPHA_XOR_PREMUL, cv::gpu::ALPHA_PLUS_PREMUL, cv::gpu::ALPHA_PREMUL)
|
||||||
|
|
||||||
struct CvtColorInfo
|
struct CvtColorInfo
|
||||||
{
|
{
|
||||||
|
@@ -52,8 +52,6 @@ void cv::gpu::gemm(const GpuMat&, const GpuMat&, double, const GpuMat&, double,
|
|||||||
void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
||||||
void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
|
void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
|
||||||
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }
|
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }
|
||||||
void cv::gpu::exp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
|
||||||
void cv::gpu::log(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
|
||||||
void cv::gpu::magnitude(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
void cv::gpu::magnitude(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
||||||
void cv::gpu::magnitudeSqr(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
void cv::gpu::magnitudeSqr(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
||||||
void cv::gpu::magnitude(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
void cv::gpu::magnitude(const GpuMat&, const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
||||||
@@ -89,9 +87,9 @@ void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const G
|
|||||||
CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
|
CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
|
||||||
CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()));
|
CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()));
|
||||||
|
|
||||||
bool tr1 = flags & GEMM_1_T;
|
bool tr1 = (flags & GEMM_1_T) != 0;
|
||||||
bool tr2 = flags & GEMM_2_T;
|
bool tr2 = (flags & GEMM_2_T) != 0;
|
||||||
bool tr3 = flags & GEMM_3_T;
|
bool tr3 = (flags & GEMM_3_T) != 0;
|
||||||
|
|
||||||
Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size();
|
Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size();
|
||||||
Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size();
|
Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size();
|
||||||
@@ -243,35 +241,66 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst, Stream& s)
|
|||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// flip
|
// flip
|
||||||
|
|
||||||
void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& s)
|
namespace
|
||||||
{
|
{
|
||||||
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4);
|
template<int DEPTH> struct NppTypeTraits;
|
||||||
|
template<> struct NppTypeTraits<CV_8U> { typedef Npp8u npp_t; };
|
||||||
|
template<> struct NppTypeTraits<CV_8S> { typedef Npp8s npp_t; };
|
||||||
|
template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
|
||||||
|
template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; };
|
||||||
|
template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; };
|
||||||
|
template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; };
|
||||||
|
template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; };
|
||||||
|
|
||||||
dst.create( src.size(), src.type() );
|
template <int DEPTH> struct NppMirrorFunc
|
||||||
|
|
||||||
NppiSize sz;
|
|
||||||
sz.width = src.cols;
|
|
||||||
sz.height = src.rows;
|
|
||||||
|
|
||||||
cudaStream_t stream = StreamAccessor::getStream(s);
|
|
||||||
|
|
||||||
NppStreamHandler h(stream);
|
|
||||||
|
|
||||||
if (src.type() == CV_8UC1)
|
|
||||||
{
|
{
|
||||||
nppSafeCall( nppiMirror_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
|
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
|
||||||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
|
|
||||||
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
nppSafeCall( nppiMirror_8u_C4R(src.ptr<Npp8u>(), static_cast<int>(src.step),
|
|
||||||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
|
|
||||||
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
|
|
||||||
}
|
|
||||||
|
|
||||||
if (stream == 0)
|
typedef NppStatus (*func_t)(const npp_t* pSrc, int nSrcStep, npp_t* pDst, int nDstStep, NppiSize oROI, NppiAxis flip);
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
};
|
||||||
|
|
||||||
|
template <int DEPTH, typename NppMirrorFunc<DEPTH>::func_t func> struct NppMirror
|
||||||
|
{
|
||||||
|
typedef typename NppMirrorFunc<DEPTH>::npp_t npp_t;
|
||||||
|
|
||||||
|
static void call(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
NppStreamHandler h(stream);
|
||||||
|
|
||||||
|
NppiSize sz;
|
||||||
|
sz.width = src.cols;
|
||||||
|
sz.height = src.rows;
|
||||||
|
|
||||||
|
nppSafeCall( func(src.ptr<npp_t>(), static_cast<int>(src.step),
|
||||||
|
dst.ptr<npp_t>(), static_cast<int>(dst.step), sz,
|
||||||
|
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );
|
||||||
|
|
||||||
|
if (stream == 0)
|
||||||
|
cudaSafeCall( cudaDeviceSynchronize() );
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode, Stream& stream)
|
||||||
|
{
|
||||||
|
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, int flipCode, cudaStream_t stream);
|
||||||
|
|
||||||
|
static const func_t funcs[6][4] =
|
||||||
|
{
|
||||||
|
{NppMirror<CV_8U, nppiMirror_8u_C1R>::call, 0, NppMirror<CV_8U, nppiMirror_8u_C3R>::call, NppMirror<CV_8U, nppiMirror_8u_C4R>::call},
|
||||||
|
{0,0,0,0},
|
||||||
|
{NppMirror<CV_16U, nppiMirror_16u_C1R>::call, 0, NppMirror<CV_16U, nppiMirror_16u_C3R>::call, NppMirror<CV_16U, nppiMirror_16u_C4R>::call},
|
||||||
|
{0,0,0,0},
|
||||||
|
{NppMirror<CV_32S, nppiMirror_32s_C1R>::call, 0, NppMirror<CV_32S, nppiMirror_32s_C3R>::call, NppMirror<CV_32S, nppiMirror_32s_C4R>::call},
|
||||||
|
{NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
|
||||||
|
};
|
||||||
|
|
||||||
|
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F);
|
||||||
|
CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
|
||||||
|
|
||||||
|
dst.create(src.size(), src.type());
|
||||||
|
|
||||||
|
funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
@@ -340,52 +369,6 @@ void cv::gpu::LUT(const GpuMat& src, const Mat& lut, GpuMat& dst, Stream& s)
|
|||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
cudaSafeCall( cudaDeviceSynchronize() );
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
|
||||||
// exp
|
|
||||||
|
|
||||||
void cv::gpu::exp(const GpuMat& src, GpuMat& dst, Stream& s)
|
|
||||||
{
|
|
||||||
CV_Assert(src.type() == CV_32FC1);
|
|
||||||
|
|
||||||
dst.create(src.size(), src.type());
|
|
||||||
|
|
||||||
NppiSize sz;
|
|
||||||
sz.width = src.cols;
|
|
||||||
sz.height = src.rows;
|
|
||||||
|
|
||||||
cudaStream_t stream = StreamAccessor::getStream(s);
|
|
||||||
|
|
||||||
NppStreamHandler h(stream);
|
|
||||||
|
|
||||||
nppSafeCall( nppiExp_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
|
|
||||||
|
|
||||||
if (stream == 0)
|
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
|
||||||
// log
|
|
||||||
|
|
||||||
void cv::gpu::log(const GpuMat& src, GpuMat& dst, Stream& s)
|
|
||||||
{
|
|
||||||
CV_Assert(src.type() == CV_32FC1);
|
|
||||||
|
|
||||||
dst.create(src.size(), src.type());
|
|
||||||
|
|
||||||
NppiSize sz;
|
|
||||||
sz.width = src.cols;
|
|
||||||
sz.height = src.rows;
|
|
||||||
|
|
||||||
cudaStream_t stream = StreamAccessor::getStream(s);
|
|
||||||
|
|
||||||
NppStreamHandler h(stream);
|
|
||||||
|
|
||||||
nppSafeCall( nppiLn_32f_C1R(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz) );
|
|
||||||
|
|
||||||
if (stream == 0)
|
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
|
||||||
}
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// NPP magnitide
|
// NPP magnitide
|
||||||
|
|
||||||
|
@@ -48,6 +48,7 @@ using namespace cv::gpu;
|
|||||||
#if !defined (HAVE_CUDA)
|
#if !defined (HAVE_CUDA)
|
||||||
|
|
||||||
void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); }
|
void cv::gpu::cvtColor(const GpuMat&, GpuMat&, int, int, Stream&) { throw_nogpu(); }
|
||||||
|
void cv::gpu::swapChannels(GpuMat&, const int[], Stream&) { throw_nogpu(); }
|
||||||
|
|
||||||
#else /* !defined (HAVE_CUDA) */
|
#else /* !defined (HAVE_CUDA) */
|
||||||
|
|
||||||
@@ -1423,4 +1424,19 @@ void cv::gpu::cvtColor(const GpuMat& src, GpuMat& dst, int code, int dcn, Stream
|
|||||||
func(src, dst, dcn, stream);
|
func(src, dst, dcn, stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void cv::gpu::swapChannels(GpuMat& image, const int dstOrder[4], Stream& s)
|
||||||
|
{
|
||||||
|
CV_Assert(image.type() == CV_8UC4);
|
||||||
|
|
||||||
|
cudaStream_t stream = StreamAccessor::getStream(s);
|
||||||
|
|
||||||
|
NppStreamHandler h(stream);
|
||||||
|
|
||||||
|
NppiSize sz;
|
||||||
|
sz.width = image.cols;
|
||||||
|
sz.height = image.rows;
|
||||||
|
|
||||||
|
nppSafeCall( nppiSwapChannels_8u_C4IR(image.ptr<Npp8u>(), static_cast<int>(image.step), sz, dstOrder) );
|
||||||
|
}
|
||||||
|
|
||||||
#endif /* !defined (HAVE_CUDA) */
|
#endif /* !defined (HAVE_CUDA) */
|
||||||
|
@@ -209,7 +209,7 @@ namespace cv { namespace gpu { namespace device
|
|||||||
cv::gpu::error("Unsupported channels count", __FILE__, __LINE__, "bilateral_filter_caller");
|
cv::gpu::error("Unsupported channels count", __FILE__, __LINE__, "bilateral_filter_caller");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (stream != 0)
|
if (stream == 0)
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
cudaSafeCall( cudaDeviceSynchronize() );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1104,9 +1104,9 @@ namespace cv { namespace gpu { namespace device
|
|||||||
cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)dst, op, WithOutMask(), stream);
|
cv::gpu::device::transform((DevMem2D_<T>)src1, (DevMem2D_<T>)dst, op, WithOutMask(), stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
template void absdiff_gpu<uchar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
|
//template void absdiff_gpu<uchar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||||
template void absdiff_gpu<schar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
|
template void absdiff_gpu<schar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||||
template void absdiff_gpu<ushort>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
|
//template void absdiff_gpu<ushort>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||||
template void absdiff_gpu<short >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
|
template void absdiff_gpu<short >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||||
template void absdiff_gpu<int >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
|
template void absdiff_gpu<int >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||||
//template void absdiff_gpu<float >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
|
//template void absdiff_gpu<float >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
|
||||||
|
File diff suppressed because it is too large
Load Diff
@@ -50,7 +50,7 @@ using namespace cv::gpu;
|
|||||||
|
|
||||||
Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
|
Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>&, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
|
||||||
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
|
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
|
||||||
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int, GpuMat& buf) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
|
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&, int, int, int, GpuMat&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
|
||||||
Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
|
Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
|
||||||
Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
|
Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
|
||||||
Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
|
Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
|
||||||
|
@@ -45,12 +45,41 @@
|
|||||||
#if !defined (HAVE_CUDA)
|
#if !defined (HAVE_CUDA)
|
||||||
|
|
||||||
void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
||||||
|
void cv::gpu::graphcut(GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
||||||
|
|
||||||
#else /* !defined (HAVE_CUDA) */
|
#else /* !defined (HAVE_CUDA) */
|
||||||
|
|
||||||
|
namespace
|
||||||
|
{
|
||||||
|
typedef NppStatus (*init_func_t)(NppiSize oSize, NppiGraphcutState** ppState, Npp8u* pDeviceMem);
|
||||||
|
|
||||||
|
class NppiGraphcutStateHandler
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
NppiGraphcutStateHandler(NppiSize sznpp, Npp8u* pDeviceMem, const init_func_t func)
|
||||||
|
{
|
||||||
|
nppSafeCall( func(sznpp, &pState, pDeviceMem) );
|
||||||
|
}
|
||||||
|
|
||||||
|
~NppiGraphcutStateHandler()
|
||||||
|
{
|
||||||
|
nppSafeCall( nppiGraphcutFree(pState) );
|
||||||
|
}
|
||||||
|
|
||||||
|
operator NppiGraphcutState*()
|
||||||
|
{
|
||||||
|
return pState;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
NppiGraphcutState* pState;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s)
|
void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& bottom, GpuMat& labels, GpuMat& buf, Stream& s)
|
||||||
{
|
{
|
||||||
Size src_size = terminals.size();
|
Size src_size = terminals.size();
|
||||||
|
|
||||||
CV_Assert(terminals.type() == CV_32S);
|
CV_Assert(terminals.type() == CV_32S);
|
||||||
CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
|
CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
|
||||||
CV_Assert(leftTransp.type() == CV_32S);
|
CV_Assert(leftTransp.type() == CV_32S);
|
||||||
@@ -70,30 +99,76 @@ void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTrans
|
|||||||
int bufsz;
|
int bufsz;
|
||||||
nppSafeCall( nppiGraphcutGetSize(sznpp, &bufsz) );
|
nppSafeCall( nppiGraphcutGetSize(sznpp, &bufsz) );
|
||||||
|
|
||||||
if ((size_t)bufsz > buf.cols * buf.rows * buf.elemSize())
|
ensureSizeIsEnough(1, bufsz, CV_8U, buf);
|
||||||
buf.create(1, bufsz, CV_8U);
|
|
||||||
|
|
||||||
cudaStream_t stream = StreamAccessor::getStream(s);
|
cudaStream_t stream = StreamAccessor::getStream(s);
|
||||||
|
|
||||||
NppStreamHandler h(stream);
|
NppStreamHandler h(stream);
|
||||||
|
|
||||||
#if CUDART_VERSION > 4000
|
NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcutInitAlloc);
|
||||||
NppiGraphcutState* pState;
|
|
||||||
nppSafeCall( nppiGraphcutInitAlloc(sznpp, &pState, buf.ptr<Npp8u>()) );
|
|
||||||
|
|
||||||
nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
|
nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
|
||||||
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), pState) );
|
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
|
||||||
|
|
||||||
nppSafeCall( nppiGraphcutFree(pState) );
|
|
||||||
#else
|
|
||||||
nppSafeCall( nppiGraphcut_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(), top.ptr<Npp32s>(), bottom.ptr<Npp32s>(),
|
|
||||||
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), buf.ptr<Npp8u>()) );
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (stream == 0)
|
if (stream == 0)
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
cudaSafeCall( cudaDeviceSynchronize() );
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void cv::gpu::graphcut(GpuMat& terminals, GpuMat& leftTransp, GpuMat& rightTransp, GpuMat& top, GpuMat& topLeft, GpuMat& topRight,
|
||||||
|
GpuMat& bottom, GpuMat& bottomLeft, GpuMat& bottomRight, GpuMat& labels, GpuMat& buf, Stream& s)
|
||||||
|
{
|
||||||
|
Size src_size = terminals.size();
|
||||||
|
|
||||||
|
CV_Assert(terminals.type() == CV_32S);
|
||||||
|
|
||||||
|
CV_Assert(leftTransp.size() == Size(src_size.height, src_size.width));
|
||||||
|
CV_Assert(leftTransp.type() == CV_32S);
|
||||||
|
|
||||||
|
CV_Assert(rightTransp.size() == Size(src_size.height, src_size.width));
|
||||||
|
CV_Assert(rightTransp.type() == CV_32S);
|
||||||
|
|
||||||
|
CV_Assert(top.size() == src_size);
|
||||||
|
CV_Assert(top.type() == CV_32S);
|
||||||
|
|
||||||
|
CV_Assert(topLeft.size() == src_size);
|
||||||
|
CV_Assert(topLeft.type() == CV_32S);
|
||||||
|
|
||||||
|
CV_Assert(topRight.size() == src_size);
|
||||||
|
CV_Assert(topRight.type() == CV_32S);
|
||||||
|
|
||||||
|
CV_Assert(bottom.size() == src_size);
|
||||||
|
CV_Assert(bottom.type() == CV_32S);
|
||||||
|
|
||||||
|
CV_Assert(bottomLeft.size() == src_size);
|
||||||
|
CV_Assert(bottomLeft.type() == CV_32S);
|
||||||
|
|
||||||
|
CV_Assert(bottomRight.size() == src_size);
|
||||||
|
CV_Assert(bottomRight.type() == CV_32S);
|
||||||
|
|
||||||
|
labels.create(src_size, CV_8U);
|
||||||
|
|
||||||
|
NppiSize sznpp;
|
||||||
|
sznpp.width = src_size.width;
|
||||||
|
sznpp.height = src_size.height;
|
||||||
|
|
||||||
|
int bufsz;
|
||||||
|
nppSafeCall( nppiGraphcut8GetSize(sznpp, &bufsz) );
|
||||||
|
|
||||||
|
ensureSizeIsEnough(1, bufsz, CV_8U, buf);
|
||||||
|
|
||||||
|
cudaStream_t stream = StreamAccessor::getStream(s);
|
||||||
|
|
||||||
|
NppStreamHandler h(stream);
|
||||||
|
|
||||||
|
NppiGraphcutStateHandler state(sznpp, buf.ptr<Npp8u>(), nppiGraphcut8InitAlloc);
|
||||||
|
|
||||||
|
nppSafeCall( nppiGraphcut8_32s8u(terminals.ptr<Npp32s>(), leftTransp.ptr<Npp32s>(), rightTransp.ptr<Npp32s>(),
|
||||||
|
top.ptr<Npp32s>(), topLeft.ptr<Npp32s>(), topRight.ptr<Npp32s>(),
|
||||||
|
bottom.ptr<Npp32s>(), bottomLeft.ptr<Npp32s>(), bottomRight.ptr<Npp32s>(),
|
||||||
|
static_cast<int>(terminals.step), static_cast<int>(leftTransp.step), sznpp, labels.ptr<Npp8u>(), static_cast<int>(labels.step), state) );
|
||||||
|
|
||||||
|
if (stream == 0)
|
||||||
|
cudaSafeCall( cudaDeviceSynchronize() );
|
||||||
|
}
|
||||||
|
|
||||||
#endif /* !defined (HAVE_CUDA) */
|
#endif /* !defined (HAVE_CUDA) */
|
||||||
|
|
||||||
|
@@ -62,7 +62,6 @@ void cv::gpu::buildWarpSphericalMaps(Size, Rect, const Mat&, const Mat&, float,
|
|||||||
void cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int, Stream&) { throw_nogpu(); }
|
void cv::gpu::rotate(const GpuMat&, GpuMat&, Size, double, double, double, int, Stream&) { throw_nogpu(); }
|
||||||
void cv::gpu::integral(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
void cv::gpu::integral(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
||||||
void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
void cv::gpu::integralBuffered(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
||||||
void cv::gpu::integral(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
|
||||||
void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
void cv::gpu::sqrIntegral(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
||||||
void cv::gpu::columnSum(const GpuMat&, GpuMat&) { throw_nogpu(); }
|
void cv::gpu::columnSum(const GpuMat&, GpuMat&) { throw_nogpu(); }
|
||||||
void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&, Stream&) { throw_nogpu(); }
|
void cv::gpu::rectStdDev(const GpuMat&, const GpuMat&, GpuMat&, const Rect&, Stream&) { throw_nogpu(); }
|
||||||
@@ -91,7 +90,7 @@ void cv::gpu::mulAndScaleSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, f
|
|||||||
void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_nogpu(); }
|
void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int, Stream&) { throw_nogpu(); }
|
||||||
void cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); }
|
void cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); }
|
||||||
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); }
|
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); }
|
||||||
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream& stream) { throw_nogpu(); }
|
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&, Stream&) { throw_nogpu(); }
|
||||||
void cv::gpu::pyrDown(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
|
void cv::gpu::pyrDown(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
|
||||||
void cv::gpu::pyrUp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
|
void cv::gpu::pyrUp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
|
||||||
void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
|
void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
|
||||||
@@ -780,44 +779,78 @@ void cv::gpu::buildWarpSphericalMaps(Size src_size, Rect dst_roi, const Mat &K,
|
|||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// rotate
|
// rotate
|
||||||
|
|
||||||
void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, Stream& s)
|
namespace
|
||||||
{
|
{
|
||||||
static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
|
template<int DEPTH> struct NppTypeTraits;
|
||||||
|
template<> struct NppTypeTraits<CV_8U> { typedef Npp8u npp_t; };
|
||||||
|
template<> struct NppTypeTraits<CV_8S> { typedef Npp8s npp_t; };
|
||||||
|
template<> struct NppTypeTraits<CV_16U> { typedef Npp16u npp_t; };
|
||||||
|
template<> struct NppTypeTraits<CV_16S> { typedef Npp16s npp_t; };
|
||||||
|
template<> struct NppTypeTraits<CV_32S> { typedef Npp32s npp_t; };
|
||||||
|
template<> struct NppTypeTraits<CV_32F> { typedef Npp32f npp_t; };
|
||||||
|
template<> struct NppTypeTraits<CV_64F> { typedef Npp64f npp_t; };
|
||||||
|
|
||||||
CV_Assert(src.type() == CV_8UC1 || src.type() == CV_8UC4);
|
template <int DEPTH> struct NppRotateFunc
|
||||||
|
{
|
||||||
|
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;
|
||||||
|
|
||||||
|
typedef NppStatus (*func_t)(const npp_t* pSrc, NppiSize oSrcSize, int nSrcStep, NppiRect oSrcROI,
|
||||||
|
npp_t* pDst, int nDstStep, NppiRect oDstROI,
|
||||||
|
double nAngle, double nShiftX, double nShiftY, int eInterpolation);
|
||||||
|
};
|
||||||
|
|
||||||
|
template <int DEPTH, typename NppRotateFunc<DEPTH>::func_t func> struct NppRotate
|
||||||
|
{
|
||||||
|
typedef typename NppRotateFunc<DEPTH>::npp_t npp_t;
|
||||||
|
|
||||||
|
static void call(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
static const int npp_inter[] = {NPPI_INTER_NN, NPPI_INTER_LINEAR, NPPI_INTER_CUBIC};
|
||||||
|
|
||||||
|
NppStreamHandler h(stream);
|
||||||
|
|
||||||
|
NppiSize srcsz;
|
||||||
|
srcsz.height = src.rows;
|
||||||
|
srcsz.width = src.cols;
|
||||||
|
NppiRect srcroi;
|
||||||
|
srcroi.x = srcroi.y = 0;
|
||||||
|
srcroi.height = src.rows;
|
||||||
|
srcroi.width = src.cols;
|
||||||
|
NppiRect dstroi;
|
||||||
|
dstroi.x = dstroi.y = 0;
|
||||||
|
dstroi.height = dst.rows;
|
||||||
|
dstroi.width = dst.cols;
|
||||||
|
|
||||||
|
nppSafeCall( func(src.ptr<npp_t>(), srcsz, static_cast<int>(src.step), srcroi,
|
||||||
|
dst.ptr<npp_t>(), static_cast<int>(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation]) );
|
||||||
|
|
||||||
|
if (stream == 0)
|
||||||
|
cudaSafeCall( cudaDeviceSynchronize() );
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, Stream& stream)
|
||||||
|
{
|
||||||
|
typedef void (*func_t)(const GpuMat& src, GpuMat& dst, Size dsize, double angle, double xShift, double yShift, int interpolation, cudaStream_t stream);
|
||||||
|
|
||||||
|
static const func_t funcs[6][4] =
|
||||||
|
{
|
||||||
|
{NppRotate<CV_8U, nppiRotate_8u_C1R>::call, 0, NppRotate<CV_8U, nppiRotate_8u_C3R>::call, NppRotate<CV_8U, nppiRotate_8u_C4R>::call},
|
||||||
|
{0,0,0,0},
|
||||||
|
{NppRotate<CV_16U, nppiRotate_16u_C1R>::call, 0, NppRotate<CV_16U, nppiRotate_16u_C3R>::call, NppRotate<CV_16U, nppiRotate_16u_C4R>::call},
|
||||||
|
{0,0,0,0},
|
||||||
|
{0,0,0,0},
|
||||||
|
{NppRotate<CV_32F, nppiRotate_32f_C1R>::call, 0, NppRotate<CV_32F, nppiRotate_32f_C3R>::call, NppRotate<CV_32F, nppiRotate_32f_C4R>::call}
|
||||||
|
};
|
||||||
|
|
||||||
|
CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32F);
|
||||||
|
CV_Assert(src.channels() == 1 || src.channels() == 3 || src.channels() == 4);
|
||||||
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
|
CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC);
|
||||||
|
|
||||||
dst.create(dsize, src.type());
|
dst.create(dsize, src.type());
|
||||||
|
|
||||||
NppiSize srcsz;
|
funcs[src.depth()][src.channels() - 1](src, dst, dsize, angle, xShift, yShift, interpolation, StreamAccessor::getStream(stream));
|
||||||
srcsz.height = src.rows;
|
|
||||||
srcsz.width = src.cols;
|
|
||||||
NppiRect srcroi;
|
|
||||||
srcroi.x = srcroi.y = 0;
|
|
||||||
srcroi.height = src.rows;
|
|
||||||
srcroi.width = src.cols;
|
|
||||||
NppiRect dstroi;
|
|
||||||
dstroi.x = dstroi.y = 0;
|
|
||||||
dstroi.height = dst.rows;
|
|
||||||
dstroi.width = dst.cols;
|
|
||||||
|
|
||||||
cudaStream_t stream = StreamAccessor::getStream(s);
|
|
||||||
|
|
||||||
NppStreamHandler h(stream);
|
|
||||||
|
|
||||||
if (src.type() == CV_8UC1)
|
|
||||||
{
|
|
||||||
nppSafeCall( nppiRotate_8u_C1R(src.ptr<Npp8u>(), srcsz, static_cast<int>(src.step), srcroi,
|
|
||||||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation]) );
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
nppSafeCall( nppiRotate_8u_C4R(src.ptr<Npp8u>(), srcsz, static_cast<int>(src.step), srcroi,
|
|
||||||
dst.ptr<Npp8u>(), static_cast<int>(dst.step), dstroi, angle, xShift, yShift, npp_inter[interpolation]) );
|
|
||||||
}
|
|
||||||
|
|
||||||
if (stream == 0)
|
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
@@ -857,30 +890,6 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer, S
|
|||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
cudaSafeCall( cudaDeviceSynchronize() );
|
||||||
}
|
}
|
||||||
|
|
||||||
void cv::gpu::integral(const GpuMat& src, GpuMat& sum, GpuMat& sqsum, Stream& s)
|
|
||||||
{
|
|
||||||
CV_Assert(src.type() == CV_8UC1);
|
|
||||||
|
|
||||||
int width = src.cols + 1, height = src.rows + 1;
|
|
||||||
|
|
||||||
sum.create(height, width, CV_32S);
|
|
||||||
sqsum.create(height, width, CV_32F);
|
|
||||||
|
|
||||||
NppiSize sz;
|
|
||||||
sz.width = src.cols;
|
|
||||||
sz.height = src.rows;
|
|
||||||
|
|
||||||
cudaStream_t stream = StreamAccessor::getStream(s);
|
|
||||||
|
|
||||||
NppStreamHandler h(stream);
|
|
||||||
|
|
||||||
nppSafeCall( nppiSqrIntegral_8u32s32f_C1R(const_cast<Npp8u*>(src.ptr<Npp8u>()), static_cast<int>(src.step),
|
|
||||||
sum.ptr<Npp32s>(), static_cast<int>(sum.step), sqsum.ptr<Npp32f>(), static_cast<int>(sqsum.step), sz, 0, 0.0f, height) );
|
|
||||||
|
|
||||||
if (stream == 0)
|
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
|
||||||
}
|
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
// sqrIntegral
|
// sqrIntegral
|
||||||
|
|
||||||
@@ -935,7 +944,6 @@ void cv::gpu::columnSum(const GpuMat& src, GpuMat& dst)
|
|||||||
|
|
||||||
void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s)
|
void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, const Rect& rect, Stream& s)
|
||||||
{
|
{
|
||||||
#if CUDART_VERSION > 4000
|
|
||||||
CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_64FC1);
|
CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_64FC1);
|
||||||
|
|
||||||
dst.create(src.size(), CV_32FC1);
|
dst.create(src.size(), CV_32FC1);
|
||||||
@@ -959,31 +967,6 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
|
|||||||
|
|
||||||
if (stream == 0)
|
if (stream == 0)
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
cudaSafeCall( cudaDeviceSynchronize() );
|
||||||
#else
|
|
||||||
CV_Assert(src.type() == CV_32SC1 && sqr.type() == CV_32FC1);
|
|
||||||
|
|
||||||
dst.create(src.size(), CV_32FC1);
|
|
||||||
|
|
||||||
NppiSize sz;
|
|
||||||
sz.width = src.cols;
|
|
||||||
sz.height = src.rows;
|
|
||||||
|
|
||||||
NppiRect nppRect;
|
|
||||||
nppRect.height = rect.height;
|
|
||||||
nppRect.width = rect.width;
|
|
||||||
nppRect.x = rect.x;
|
|
||||||
nppRect.y = rect.y;
|
|
||||||
|
|
||||||
cudaStream_t stream = StreamAccessor::getStream(s);
|
|
||||||
|
|
||||||
NppStreamHandler h(stream);
|
|
||||||
|
|
||||||
nppSafeCall( nppiRectStdDev_32s32f_C1R(src.ptr<Npp32s>(), static_cast<int>(src.step), sqr.ptr<Npp32f>(), static_cast<int>(sqr.step),
|
|
||||||
dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, nppRect) );
|
|
||||||
|
|
||||||
if (stream == 0)
|
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -992,25 +975,19 @@ void cv::gpu::rectStdDev(const GpuMat& src, const GpuMat& sqr, GpuMat& dst, cons
|
|||||||
|
|
||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
template<int n> struct NPPTypeTraits;
|
|
||||||
template<> struct NPPTypeTraits<CV_8U> { typedef Npp8u npp_type; };
|
|
||||||
template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
|
|
||||||
template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
|
|
||||||
template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
|
|
||||||
|
|
||||||
typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, int* hpBufferSize);
|
typedef NppStatus (*get_buf_size_c1_t)(NppiSize oSizeROI, int nLevels, int* hpBufferSize);
|
||||||
typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], int* hpBufferSize);
|
typedef NppStatus (*get_buf_size_c4_t)(NppiSize oSizeROI, int nLevels[], int* hpBufferSize);
|
||||||
|
|
||||||
template<int SDEPTH> struct NppHistogramEvenFuncC1
|
template<int SDEPTH> struct NppHistogramEvenFuncC1
|
||||||
{
|
{
|
||||||
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
|
typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
|
||||||
|
|
||||||
typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s * pHist,
|
typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI, Npp32s * pHist,
|
||||||
int nLevels, Npp32s nLowerLevel, Npp32s nUpperLevel, Npp8u * pBuffer);
|
int nLevels, Npp32s nLowerLevel, Npp32s nUpperLevel, Npp8u * pBuffer);
|
||||||
};
|
};
|
||||||
template<int SDEPTH> struct NppHistogramEvenFuncC4
|
template<int SDEPTH> struct NppHistogramEvenFuncC4
|
||||||
{
|
{
|
||||||
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
|
typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
|
||||||
|
|
||||||
typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI,
|
typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, NppiSize oSizeROI,
|
||||||
Npp32s * pHist[4], int nLevels[4], Npp32s nLowerLevel[4], Npp32s nUpperLevel[4], Npp8u * pBuffer);
|
Npp32s * pHist[4], int nLevels[4], Npp32s nLowerLevel[4], Npp32s nUpperLevel[4], Npp8u * pBuffer);
|
||||||
@@ -1079,7 +1056,7 @@ namespace
|
|||||||
|
|
||||||
template<int SDEPTH> struct NppHistogramRangeFuncC1
|
template<int SDEPTH> struct NppHistogramRangeFuncC1
|
||||||
{
|
{
|
||||||
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
|
typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
|
||||||
typedef Npp32s level_t;
|
typedef Npp32s level_t;
|
||||||
enum {LEVEL_TYPE_CODE=CV_32SC1};
|
enum {LEVEL_TYPE_CODE=CV_32SC1};
|
||||||
|
|
||||||
@@ -1097,7 +1074,7 @@ namespace
|
|||||||
};
|
};
|
||||||
template<int SDEPTH> struct NppHistogramRangeFuncC4
|
template<int SDEPTH> struct NppHistogramRangeFuncC4
|
||||||
{
|
{
|
||||||
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
|
typedef typename NppTypeTraits<SDEPTH>::npp_t src_t;
|
||||||
typedef Npp32s level_t;
|
typedef Npp32s level_t;
|
||||||
enum {LEVEL_TYPE_CODE=CV_32SC1};
|
enum {LEVEL_TYPE_CODE=CV_32SC1};
|
||||||
|
|
||||||
|
@@ -171,8 +171,8 @@ bool cv::gpu::DeviceInfo::supports(cv::gpu::FeatureSet) const { throw_nogpu(); r
|
|||||||
bool cv::gpu::DeviceInfo::isCompatible() const { throw_nogpu(); return false; }
|
bool cv::gpu::DeviceInfo::isCompatible() const { throw_nogpu(); return false; }
|
||||||
void cv::gpu::DeviceInfo::query() { throw_nogpu(); }
|
void cv::gpu::DeviceInfo::query() { throw_nogpu(); }
|
||||||
void cv::gpu::DeviceInfo::queryMemory(size_t&, size_t&) const { throw_nogpu(); }
|
void cv::gpu::DeviceInfo::queryMemory(size_t&, size_t&) const { throw_nogpu(); }
|
||||||
void cv::gpu::printCudaDeviceInfo(int device) { throw_nogpu(); }
|
void cv::gpu::printCudaDeviceInfo(int) { throw_nogpu(); }
|
||||||
void cv::gpu::printShortCudaDeviceInfo(int device) { throw_nogpu(); }
|
void cv::gpu::printShortCudaDeviceInfo(int) { throw_nogpu(); }
|
||||||
|
|
||||||
#else /* !defined (HAVE_CUDA) */
|
#else /* !defined (HAVE_CUDA) */
|
||||||
|
|
||||||
@@ -346,7 +346,6 @@ void cv::gpu::printCudaDeviceInfo(int device)
|
|||||||
convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount);
|
convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount);
|
||||||
printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f);
|
printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f);
|
||||||
|
|
||||||
#if (CUDART_VERSION >= 4000)
|
|
||||||
// This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output
|
// This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output
|
||||||
int memoryClock, memBusWidth, L2CacheSize;
|
int memoryClock, memBusWidth, L2CacheSize;
|
||||||
getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev );
|
getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev );
|
||||||
@@ -364,7 +363,7 @@ void cv::gpu::printCudaDeviceInfo(int device)
|
|||||||
printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n",
|
printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n",
|
||||||
prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1],
|
prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1],
|
||||||
prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);
|
prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]);
|
||||||
#endif
|
|
||||||
printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem);
|
printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem);
|
||||||
printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock);
|
printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock);
|
||||||
printf(" Total number of registers available per block: %d\n", prop.regsPerBlock);
|
printf(" Total number of registers available per block: %d\n", prop.regsPerBlock);
|
||||||
@@ -375,11 +374,7 @@ void cv::gpu::printCudaDeviceInfo(int device)
|
|||||||
printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch);
|
printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch);
|
||||||
printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment);
|
printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment);
|
||||||
|
|
||||||
#if CUDART_VERSION >= 4000
|
|
||||||
printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount);
|
printf(" Concurrent copy and execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount);
|
||||||
#else
|
|
||||||
printf(" Concurrent copy and execution: %s\n", prop.deviceOverlap ? "Yes" : "No");
|
|
||||||
#endif
|
|
||||||
printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No");
|
printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No");
|
||||||
printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No");
|
printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No");
|
||||||
printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No");
|
printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No");
|
||||||
@@ -388,10 +383,8 @@ void cv::gpu::printCudaDeviceInfo(int device)
|
|||||||
printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No");
|
printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No");
|
||||||
printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? "Yes" : "No");
|
printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? "Yes" : "No");
|
||||||
printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No");
|
printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No");
|
||||||
#if CUDART_VERSION >= 4000
|
|
||||||
printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No");
|
printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No");
|
||||||
printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID );
|
printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID );
|
||||||
#endif
|
|
||||||
printf(" Compute Mode:\n");
|
printf(" Compute Mode:\n");
|
||||||
printf(" %s \n", computeMode[prop.computeMode]);
|
printf(" %s \n", computeMode[prop.computeMode]);
|
||||||
}
|
}
|
||||||
|
@@ -48,6 +48,7 @@ using namespace cv::gpu;
|
|||||||
#if !defined (HAVE_CUDA)
|
#if !defined (HAVE_CUDA)
|
||||||
|
|
||||||
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_nogpu(); }
|
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_nogpu(); }
|
||||||
|
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&, GpuMat&) { throw_nogpu(); }
|
||||||
double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; }
|
double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; }
|
||||||
double cv::gpu::norm(const GpuMat&, int, GpuMat&) { throw_nogpu(); return 0.0; }
|
double cv::gpu::norm(const GpuMat&, int, GpuMat&) { throw_nogpu(); return 0.0; }
|
||||||
double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; }
|
double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; }
|
||||||
@@ -108,6 +109,12 @@ namespace
|
|||||||
// meanStdDev
|
// meanStdDev
|
||||||
|
|
||||||
void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
|
void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
|
||||||
|
{
|
||||||
|
GpuMat buf;
|
||||||
|
meanStdDev(src, mean, stddev, buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev, GpuMat& buf)
|
||||||
{
|
{
|
||||||
CV_Assert(src.type() == CV_8UC1);
|
CV_Assert(src.type() == CV_8UC1);
|
||||||
|
|
||||||
@@ -117,15 +124,12 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
|
|||||||
|
|
||||||
DeviceBuffer dbuf(2);
|
DeviceBuffer dbuf(2);
|
||||||
|
|
||||||
#if CUDART_VERSION > 4000
|
|
||||||
int bufSize;
|
int bufSize;
|
||||||
nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
|
nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
|
||||||
|
|
||||||
GpuMat buf(1, bufSize, CV_8UC1);
|
ensureSizeIsEnough(1, bufSize, CV_8UC1, buf);
|
||||||
|
|
||||||
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dbuf, (double*)dbuf + 1) );
|
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, buf.ptr<Npp8u>(), dbuf, (double*)dbuf + 1) );
|
||||||
#else
|
|
||||||
nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), sz, dbuf, (double*)dbuf + 1) );
|
|
||||||
#endif
|
|
||||||
|
|
||||||
cudaSafeCall( cudaDeviceSynchronize() );
|
cudaSafeCall( cudaDeviceSynchronize() );
|
||||||
|
|
||||||
@@ -133,7 +137,6 @@ void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
|
|||||||
dbuf.download(ptrs);
|
dbuf.download(ptrs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// norm
|
// norm
|
||||||
|
|
||||||
@@ -151,7 +154,7 @@ double cv::gpu::norm(const GpuMat& src, int normType, GpuMat& buf)
|
|||||||
return absSum(src_single_channel, buf)[0];
|
return absSum(src_single_channel, buf)[0];
|
||||||
|
|
||||||
if (normType == NORM_L2)
|
if (normType == NORM_L2)
|
||||||
return sqrt(sqrSum(src_single_channel, buf)[0]);
|
return std::sqrt(sqrSum(src_single_channel, buf)[0]);
|
||||||
|
|
||||||
if (normType == NORM_INF)
|
if (normType == NORM_INF)
|
||||||
{
|
{
|
||||||
|
@@ -228,7 +228,7 @@ void cv::gpu::createOpticalFlowNeedleMap(const GpuMat& u, const GpuMat& v, GpuMa
|
|||||||
minMax(u_avg, 0, &uMax);
|
minMax(u_avg, 0, &uMax);
|
||||||
minMax(v_avg, 0, &vMax);
|
minMax(v_avg, 0, &vMax);
|
||||||
|
|
||||||
float max_flow = static_cast<float>(sqrt(uMax * uMax + vMax * vMax));
|
float max_flow = static_cast<float>(std::sqrt(uMax * uMax + vMax * vMax));
|
||||||
|
|
||||||
CreateOpticalFlowNeedleMap_gpu(u_avg, v_avg, vertex.ptr<float>(), colors.ptr<float>(), max_flow, 1.0f / u.cols, 1.0f / u.rows);
|
CreateOpticalFlowNeedleMap_gpu(u_avg, v_avg, vertex.ptr<float>(), colors.ptr<float>(), max_flow, 1.0f / u.cols, 1.0f / u.rows);
|
||||||
|
|
||||||
|
@@ -160,7 +160,7 @@ void cv::gpu::FarnebackOpticalFlow::setPolynomialExpansionConsts(int n, double s
|
|||||||
double ig11, ig03, ig33, ig55;
|
double ig11, ig03, ig33, ig55;
|
||||||
prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);
|
prepareGaussian(n, sigma, g, xg, xxg, ig11, ig03, ig33, ig55);
|
||||||
|
|
||||||
device::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, ig11, ig03, ig33, ig55);
|
device::optflow_farneback::setPolynomialExpansionConsts(n, g, xg, xxg, static_cast<float>(ig11), static_cast<float>(ig03), static_cast<float>(ig33), static_cast<float>(ig55));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -429,11 +429,11 @@ void cv::gpu::ORB_GPU::setParams(size_t n_features, const ORB::CommonParams& det
|
|||||||
// pre-compute the end of a row in a circular patch
|
// pre-compute the end of a row in a circular patch
|
||||||
int half_patch_size = params_.patch_size_ / 2;
|
int half_patch_size = params_.patch_size_ / 2;
|
||||||
vector<int> u_max(half_patch_size + 1);
|
vector<int> u_max(half_patch_size + 1);
|
||||||
for (int v = 0; v <= half_patch_size * sqrt(2.f) / 2 + 1; ++v)
|
for (int v = 0; v <= half_patch_size * std::sqrt(2.f) / 2 + 1; ++v)
|
||||||
u_max[v] = cvRound(sqrt(static_cast<float>(half_patch_size * half_patch_size - v * v)));
|
u_max[v] = cvRound(std::sqrt(static_cast<float>(half_patch_size * half_patch_size - v * v)));
|
||||||
|
|
||||||
// Make sure we are symmetric
|
// Make sure we are symmetric
|
||||||
for (int v = half_patch_size, v_0 = 0; v >= half_patch_size * sqrt(2.f) / 2; --v)
|
for (int v = half_patch_size, v_0 = 0; v >= half_patch_size * std::sqrt(2.f) / 2; --v)
|
||||||
{
|
{
|
||||||
while (u_max[v_0] == u_max[v_0 + 1])
|
while (u_max[v_0] == u_max[v_0 + 1])
|
||||||
++v_0;
|
++v_0;
|
||||||
|
@@ -91,8 +91,8 @@
|
|||||||
#include "nvidia/NCVHaarObjectDetection.hpp"
|
#include "nvidia/NCVHaarObjectDetection.hpp"
|
||||||
#include "nvidia/NCVBroxOpticalFlow.hpp"
|
#include "nvidia/NCVBroxOpticalFlow.hpp"
|
||||||
|
|
||||||
#define CUDART_MINIMUM_REQUIRED_VERSION 4000
|
#define CUDART_MINIMUM_REQUIRED_VERSION 4010
|
||||||
#define NPP_MINIMUM_REQUIRED_VERSION 4000
|
#define NPP_MINIMUM_REQUIRED_VERSION 4100
|
||||||
|
|
||||||
#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
|
#if (CUDART_VERSION < CUDART_MINIMUM_REQUIRED_VERSION)
|
||||||
#error "Insufficient Cuda Runtime library version, please update it."
|
#error "Insufficient Cuda Runtime library version, please update it."
|
||||||
|
@@ -69,10 +69,10 @@ PARAM_TEST_CASE(ArithmTestBase, cv::gpu::DeviceInfo, MatType, UseRoi)
|
|||||||
|
|
||||||
size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200));
|
size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200));
|
||||||
|
|
||||||
mat1 = randomMat(rng, size, type, 1, 16, false);
|
mat1 = randomMat(rng, size, type, 5, 16, false);
|
||||||
mat2 = randomMat(rng, size, type, 1, 16, false);
|
mat2 = randomMat(rng, size, type, 5, 16, false);
|
||||||
|
|
||||||
val = cv::Scalar(rng.uniform(0.1, 3.0), rng.uniform(0.1, 3.0), rng.uniform(0.1, 3.0), rng.uniform(0.1, 3.0));
|
val = cv::Scalar(rng.uniform(1, 3), rng.uniform(1, 3), rng.uniform(1, 3), rng.uniform(1, 3));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -115,7 +115,8 @@ TEST_P(Add, Scalar)
|
|||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(
|
INSTANTIATE_TEST_CASE_P(Arithm, Add, Combine(
|
||||||
ALL_DEVICES,
|
ALL_DEVICES,
|
||||||
Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1),
|
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4,
|
||||||
|
CV_32SC1, CV_32SC2, CV_32SC3, CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4),
|
||||||
USE_ROI));
|
USE_ROI));
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
@@ -157,7 +158,8 @@ TEST_P(Subtract, Scalar)
|
|||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(Arithm, Subtract, Combine(
|
INSTANTIATE_TEST_CASE_P(Arithm, Subtract, Combine(
|
||||||
ALL_DEVICES,
|
ALL_DEVICES,
|
||||||
Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1),
|
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC2, CV_16SC3, CV_16SC4,
|
||||||
|
CV_32SC1, CV_32SC2, CV_32SC3, CV_32FC1, CV_32FC2, CV_32FC3, CV_32FC4),
|
||||||
USE_ROI));
|
USE_ROI));
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
@@ -199,7 +201,8 @@ TEST_P(Multiply, Scalar)
|
|||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(Arithm, Multiply, Combine(
|
INSTANTIATE_TEST_CASE_P(Arithm, Multiply, Combine(
|
||||||
ALL_DEVICES,
|
ALL_DEVICES,
|
||||||
Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1),
|
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC3, CV_16SC4,
|
||||||
|
CV_32SC1, CV_32SC3, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||||
USE_ROI));
|
USE_ROI));
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
@@ -220,7 +223,7 @@ TEST_P(Divide, Array)
|
|||||||
|
|
||||||
gpuRes.download(dst);
|
gpuRes.download(dst);
|
||||||
|
|
||||||
EXPECT_MAT_NEAR(dst_gold, dst, 1.0);
|
EXPECT_MAT_NEAR(dst_gold, dst, mat1.depth() == CV_32F ? 1e-5 : 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_P(Divide, Scalar)
|
TEST_P(Divide, Scalar)
|
||||||
@@ -236,12 +239,13 @@ TEST_P(Divide, Scalar)
|
|||||||
|
|
||||||
gpuRes.download(dst);
|
gpuRes.download(dst);
|
||||||
|
|
||||||
EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
|
EXPECT_MAT_NEAR(dst_gold, dst, mat1.depth() == CV_32F ? 1e-5 : 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(Arithm, Divide, Combine(
|
INSTANTIATE_TEST_CASE_P(Arithm, Divide, Combine(
|
||||||
ALL_DEVICES,
|
ALL_DEVICES,
|
||||||
Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1),
|
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_16SC1, CV_16SC3, CV_16SC4,
|
||||||
|
CV_32SC1, CV_32SC3, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||||
USE_ROI));
|
USE_ROI));
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
@@ -312,6 +316,83 @@ INSTANTIATE_TEST_CASE_P(Arithm, Absdiff, Combine(
|
|||||||
Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1),
|
Values(CV_8UC1, CV_16UC1, CV_32SC1, CV_32FC1),
|
||||||
USE_ROI));
|
USE_ROI));
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// abs
|
||||||
|
|
||||||
|
struct Abs : ArithmTestBase {};
|
||||||
|
|
||||||
|
TEST_P(Abs, Array)
|
||||||
|
{
|
||||||
|
cv::Mat dst_gold = cv::abs(mat1);
|
||||||
|
|
||||||
|
cv::Mat dst;
|
||||||
|
|
||||||
|
cv::gpu::GpuMat gpuRes;
|
||||||
|
|
||||||
|
cv::gpu::abs(loadMat(mat1, useRoi), gpuRes);
|
||||||
|
|
||||||
|
gpuRes.download(dst);
|
||||||
|
|
||||||
|
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_CASE_P(Arithm, Abs, Combine(
|
||||||
|
ALL_DEVICES,
|
||||||
|
Values(CV_16SC1, CV_32FC1),
|
||||||
|
USE_ROI));
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Sqr
|
||||||
|
|
||||||
|
struct Sqr : ArithmTestBase {};
|
||||||
|
|
||||||
|
TEST_P(Sqr, Array)
|
||||||
|
{
|
||||||
|
cv::Mat dst_gold;
|
||||||
|
cv::multiply(mat1, mat1, dst_gold);
|
||||||
|
|
||||||
|
cv::Mat dst;
|
||||||
|
|
||||||
|
cv::gpu::GpuMat gpuRes;
|
||||||
|
|
||||||
|
cv::gpu::sqr(loadMat(mat1, useRoi), gpuRes);
|
||||||
|
|
||||||
|
gpuRes.download(dst);
|
||||||
|
|
||||||
|
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_CASE_P(Arithm, Sqr, Combine(
|
||||||
|
ALL_DEVICES,
|
||||||
|
Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_32FC1),
|
||||||
|
USE_ROI));
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Sqrt
|
||||||
|
|
||||||
|
struct Sqrt : ArithmTestBase {};
|
||||||
|
|
||||||
|
TEST_P(Sqrt, Array)
|
||||||
|
{
|
||||||
|
cv::Mat dst_gold;
|
||||||
|
cv::sqrt(mat1, dst_gold);
|
||||||
|
|
||||||
|
cv::Mat dst;
|
||||||
|
|
||||||
|
cv::gpu::GpuMat gpuRes;
|
||||||
|
|
||||||
|
cv::gpu::sqrt(loadMat(mat1, useRoi), gpuRes);
|
||||||
|
|
||||||
|
gpuRes.download(dst);
|
||||||
|
|
||||||
|
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_CASE_P(Arithm, Sqrt, Combine(
|
||||||
|
ALL_DEVICES,
|
||||||
|
Values(MatType(CV_32FC1)),
|
||||||
|
USE_ROI));
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// compare
|
// compare
|
||||||
|
|
||||||
@@ -513,7 +594,7 @@ TEST_P(Flip, Accuracy)
|
|||||||
|
|
||||||
INSTANTIATE_TEST_CASE_P(Arithm, Flip, Combine(
|
INSTANTIATE_TEST_CASE_P(Arithm, Flip, Combine(
|
||||||
ALL_DEVICES,
|
ALL_DEVICES,
|
||||||
Values(CV_8UC1, CV_8UC4),
|
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32SC1, CV_32SC3, CV_32SC4, CV_32FC1, CV_32FC3, CV_32FC4),
|
||||||
Values((int)FLIP_BOTH, (int)FLIP_X, (int)FLIP_Y),
|
Values((int)FLIP_BOTH, (int)FLIP_X, (int)FLIP_Y),
|
||||||
USE_ROI));
|
USE_ROI));
|
||||||
|
|
||||||
@@ -1329,6 +1410,90 @@ INSTANTIATE_TEST_CASE_P(Arithm, Bitwise, Combine(
|
|||||||
ALL_DEVICES,
|
ALL_DEVICES,
|
||||||
ALL_TYPES));
|
ALL_TYPES));
|
||||||
|
|
||||||
|
PARAM_TEST_CASE(BitwiseScalar, cv::gpu::DeviceInfo, MatType)
|
||||||
|
{
|
||||||
|
cv::gpu::DeviceInfo devInfo;
|
||||||
|
int type;
|
||||||
|
|
||||||
|
cv::Size size;
|
||||||
|
cv::Mat mat;
|
||||||
|
cv::Scalar sc;
|
||||||
|
|
||||||
|
virtual void SetUp()
|
||||||
|
{
|
||||||
|
devInfo = GET_PARAM(0);
|
||||||
|
type = GET_PARAM(1);
|
||||||
|
|
||||||
|
cv::gpu::setDevice(devInfo.deviceID());
|
||||||
|
|
||||||
|
cv::RNG& rng = cvtest::TS::ptr()->get_rng();
|
||||||
|
|
||||||
|
size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200));
|
||||||
|
|
||||||
|
mat.create(size, type);
|
||||||
|
|
||||||
|
for (int i = 0; i < mat.rows; ++i)
|
||||||
|
{
|
||||||
|
cv::Mat row(1, static_cast<int>(mat.cols * mat.elemSize()), CV_8U, (void*)mat.ptr(i));
|
||||||
|
rng.fill(row, cv::RNG::UNIFORM, cv::Scalar(0), cv::Scalar(255));
|
||||||
|
}
|
||||||
|
|
||||||
|
sc = cv::Scalar(rng.uniform(0, 255), rng.uniform(0, 255), rng.uniform(0, 255), rng.uniform(0, 255));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
TEST_P(BitwiseScalar, Or)
|
||||||
|
{
|
||||||
|
cv::Mat dst_gold;
|
||||||
|
cv::bitwise_or(mat, sc, dst_gold);
|
||||||
|
|
||||||
|
cv::Mat dst;
|
||||||
|
|
||||||
|
cv::gpu::GpuMat dev_dst;
|
||||||
|
|
||||||
|
cv::gpu::bitwise_or(loadMat(mat), sc, dev_dst);
|
||||||
|
|
||||||
|
dev_dst.download(dst);
|
||||||
|
|
||||||
|
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_P(BitwiseScalar, And)
|
||||||
|
{
|
||||||
|
cv::Mat dst_gold;
|
||||||
|
cv::bitwise_and(mat, sc, dst_gold);
|
||||||
|
|
||||||
|
cv::Mat dst;
|
||||||
|
|
||||||
|
cv::gpu::GpuMat dev_dst;
|
||||||
|
|
||||||
|
cv::gpu::bitwise_and(loadMat(mat), sc, dev_dst);
|
||||||
|
|
||||||
|
dev_dst.download(dst);
|
||||||
|
|
||||||
|
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_P(BitwiseScalar, Xor)
|
||||||
|
{
|
||||||
|
cv::Mat dst_gold;
|
||||||
|
cv::bitwise_xor(mat, sc, dst_gold);
|
||||||
|
|
||||||
|
cv::Mat dst;
|
||||||
|
|
||||||
|
cv::gpu::GpuMat dev_dst;
|
||||||
|
|
||||||
|
cv::gpu::bitwise_xor(loadMat(mat), sc, dev_dst);
|
||||||
|
|
||||||
|
dev_dst.download(dst);
|
||||||
|
|
||||||
|
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_CASE_P(Arithm, BitwiseScalar, Combine(
|
||||||
|
ALL_DEVICES,
|
||||||
|
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4, CV_32SC1, CV_32SC3, CV_32SC4)));
|
||||||
|
|
||||||
//////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
// addWeighted
|
// addWeighted
|
||||||
|
|
||||||
|
@@ -2377,6 +2377,49 @@ INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, Combine(
|
|||||||
Values(CV_8U, CV_16U, CV_32F),
|
Values(CV_8U, CV_16U, CV_32F),
|
||||||
USE_ROI));
|
USE_ROI));
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// swapChannels
|
||||||
|
|
||||||
|
PARAM_TEST_CASE(SwapChannels, cv::gpu::DeviceInfo, UseRoi)
|
||||||
|
{
|
||||||
|
cv::gpu::DeviceInfo devInfo;
|
||||||
|
bool useRoi;
|
||||||
|
|
||||||
|
cv::Mat img;
|
||||||
|
|
||||||
|
cv::Mat dst_gold;
|
||||||
|
|
||||||
|
virtual void SetUp()
|
||||||
|
{
|
||||||
|
devInfo = GET_PARAM(0);
|
||||||
|
useRoi = GET_PARAM(1);
|
||||||
|
|
||||||
|
cv::gpu::setDevice(devInfo.deviceID());
|
||||||
|
|
||||||
|
cv::Mat imgBase = readImage("stereobm/aloe-L.png");
|
||||||
|
ASSERT_FALSE(imgBase.empty());
|
||||||
|
|
||||||
|
cv::cvtColor(imgBase, img, cv::COLOR_BGR2BGRA);
|
||||||
|
|
||||||
|
cv::cvtColor(img, dst_gold, cv::COLOR_BGRA2RGBA);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
TEST_P(SwapChannels, Accuracy)
|
||||||
|
{
|
||||||
|
cv::gpu::GpuMat gpuImage = loadMat(img, useRoi);
|
||||||
|
|
||||||
|
const int dstOrder[] = {2, 1, 0, 3};
|
||||||
|
cv::gpu::swapChannels(gpuImage, dstOrder);
|
||||||
|
|
||||||
|
cv::Mat dst;
|
||||||
|
gpuImage.download(dst);
|
||||||
|
|
||||||
|
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_CASE_P(ImgProc, SwapChannels, Combine(ALL_DEVICES, USE_ROI));
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// histograms
|
// histograms
|
||||||
|
|
||||||
|
@@ -136,7 +136,7 @@ int main(int argc, char** argv)
|
|||||||
|
|
||||||
#else // HAVE_CUDA
|
#else // HAVE_CUDA
|
||||||
|
|
||||||
int main(int argc, char** argv)
|
int main()
|
||||||
{
|
{
|
||||||
printf("OpenCV was built without CUDA support\n");
|
printf("OpenCV was built without CUDA support\n");
|
||||||
return 0;
|
return 0;
|
||||||
|
@@ -146,7 +146,6 @@ CV_IMPL double cvGetWindowProperty(const char* name, int prop_id)
|
|||||||
default:
|
default:
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
return -1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void cv::namedWindow( const string& winname, int flags )
|
void cv::namedWindow( const string& winname, int flags )
|
||||||
|
@@ -105,6 +105,8 @@ public:
|
|||||||
GraphCutSeamFinder(int cost_type = COST_COLOR_GRAD, float terminal_cost = 10000.f,
|
GraphCutSeamFinder(int cost_type = COST_COLOR_GRAD, float terminal_cost = 10000.f,
|
||||||
float bad_region_penalty = 1000.f);
|
float bad_region_penalty = 1000.f);
|
||||||
|
|
||||||
|
~GraphCutSeamFinder();
|
||||||
|
|
||||||
void find(const std::vector<Mat> &src, const std::vector<Point> &corners,
|
void find(const std::vector<Mat> &src, const std::vector<Point> &corners,
|
||||||
std::vector<Mat> &masks);
|
std::vector<Mat> &masks);
|
||||||
|
|
||||||
|
@@ -411,6 +411,8 @@ void GraphCutSeamFinder::Impl::findInPair(size_t first, size_t second, Rect roi)
|
|||||||
GraphCutSeamFinder::GraphCutSeamFinder(int cost_type, float terminal_cost, float bad_region_penalty)
|
GraphCutSeamFinder::GraphCutSeamFinder(int cost_type, float terminal_cost, float bad_region_penalty)
|
||||||
: impl_(new Impl(cost_type, terminal_cost, bad_region_penalty)) {}
|
: impl_(new Impl(cost_type, terminal_cost, bad_region_penalty)) {}
|
||||||
|
|
||||||
|
GraphCutSeamFinder::~GraphCutSeamFinder() {}
|
||||||
|
|
||||||
|
|
||||||
void GraphCutSeamFinder::find(const vector<Mat> &src, const vector<Point> &corners,
|
void GraphCutSeamFinder::find(const vector<Mat> &src, const vector<Point> &corners,
|
||||||
vector<Mat> &masks)
|
vector<Mat> &masks)
|
||||||
|
@@ -247,7 +247,7 @@ inline int clamp(int val, int minVal, int maxVal)
|
|||||||
return max(min(val, maxVal), minVal);
|
return max(min(val, maxVal), minVal);
|
||||||
}
|
}
|
||||||
|
|
||||||
void PointCloudRenderer::onMouseEvent(int event, int x, int y, int flags)
|
void PointCloudRenderer::onMouseEvent(int event, int x, int y, int /*flags*/)
|
||||||
{
|
{
|
||||||
static int oldx = x;
|
static int oldx = x;
|
||||||
static int oldy = y;
|
static int oldy = y;
|
||||||
|
68
samples/gpu/alpha_comp.cpp
Normal file
68
samples/gpu/alpha_comp.cpp
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
#include "opencv2/core/opengl_interop.hpp"
|
||||||
|
#include "opencv2/highgui/highgui.hpp"
|
||||||
|
#include "opencv2/gpu/gpu.hpp"
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
using namespace cv;
|
||||||
|
using namespace cv::gpu;
|
||||||
|
|
||||||
|
int main()
|
||||||
|
{
|
||||||
|
cout << "This program demonstrates using alphaComp" << endl;
|
||||||
|
cout << "Press SPACE to change compositing operation" << endl;
|
||||||
|
cout << "Press ESC to exit" << endl;
|
||||||
|
|
||||||
|
namedWindow("First Image", WINDOW_NORMAL);
|
||||||
|
namedWindow("Second Image", WINDOW_NORMAL);
|
||||||
|
namedWindow("Result", WINDOW_OPENGL);
|
||||||
|
|
||||||
|
setGlDevice();
|
||||||
|
|
||||||
|
Mat src1(640, 480, CV_8UC4, Scalar::all(0));
|
||||||
|
Mat src2(640, 480, CV_8UC4, Scalar::all(0));
|
||||||
|
|
||||||
|
rectangle(src1, Rect(50, 50, 200, 200), Scalar(0, 0, 255, 128), 30);
|
||||||
|
rectangle(src2, Rect(100, 100, 200, 200), Scalar(255, 0, 0, 128), 30);
|
||||||
|
|
||||||
|
GpuMat d_src1(src1);
|
||||||
|
GpuMat d_src2(src2);
|
||||||
|
|
||||||
|
GpuMat d_res;
|
||||||
|
|
||||||
|
imshow("First Image", src1);
|
||||||
|
imshow("Second Image", src2);
|
||||||
|
|
||||||
|
int alpha_op = ALPHA_OVER;
|
||||||
|
|
||||||
|
const char* op_names[] =
|
||||||
|
{
|
||||||
|
"ALPHA_OVER", "ALPHA_IN", "ALPHA_OUT", "ALPHA_ATOP", "ALPHA_XOR", "ALPHA_PLUS", "ALPHA_OVER_PREMUL", "ALPHA_IN_PREMUL", "ALPHA_OUT_PREMUL",
|
||||||
|
"ALPHA_ATOP_PREMUL", "ALPHA_XOR_PREMUL", "ALPHA_PLUS_PREMUL", "ALPHA_PREMUL"
|
||||||
|
};
|
||||||
|
|
||||||
|
while (true)
|
||||||
|
{
|
||||||
|
cout << op_names[alpha_op] << endl;
|
||||||
|
|
||||||
|
alphaComp(d_src1, d_src2, d_res, alpha_op);
|
||||||
|
|
||||||
|
imshow("Result", d_res);
|
||||||
|
|
||||||
|
char key = static_cast<char>(waitKey());
|
||||||
|
|
||||||
|
if (key == 27)
|
||||||
|
break;
|
||||||
|
|
||||||
|
if (key == 32)
|
||||||
|
{
|
||||||
|
++alpha_op;
|
||||||
|
|
||||||
|
if (alpha_op > ALPHA_PREMUL)
|
||||||
|
alpha_op = ALPHA_OVER;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
@@ -226,7 +226,7 @@ int main(int argc, const char* argv[])
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case 'S':
|
case 'S':
|
||||||
if (currentFrame < frames.size() - 1)
|
if (currentFrame < static_cast<int>(frames.size()) - 1)
|
||||||
++currentFrame;
|
++currentFrame;
|
||||||
|
|
||||||
imshow("Interpolated frame", frames[currentFrame]);
|
imshow("Interpolated frame", frames[currentFrame]);
|
||||||
|
@@ -26,7 +26,7 @@ void colorizeFlow(const Mat &u, const Mat &v, Mat &dst)
|
|||||||
minMaxLoc(v, &vMin, &vMax, 0, 0);
|
minMaxLoc(v, &vMin, &vMax, 0, 0);
|
||||||
uMin = ::abs(uMin); uMax = ::abs(uMax);
|
uMin = ::abs(uMin); uMax = ::abs(uMax);
|
||||||
vMin = ::abs(vMin); vMax = ::abs(vMax);
|
vMin = ::abs(vMin); vMax = ::abs(vMax);
|
||||||
float dMax = ::max(::max(uMin, uMax), ::max(vMin, vMax));
|
float dMax = static_cast<float>(::max(::max(uMin, uMax), ::max(vMin, vMax)));
|
||||||
|
|
||||||
dst.create(u.size(), CV_8UC3);
|
dst.create(u.size(), CV_8UC3);
|
||||||
for (int y = 0; y < u.rows; ++y)
|
for (int y = 0; y < u.rows; ++y)
|
||||||
@@ -111,11 +111,11 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
s.str("");
|
s.str("");
|
||||||
s << "opt. flow FPS: " << cvRound((getTickFrequency()/(tc1-tc0)));
|
s << "opt. flow FPS: " << cvRound((getTickFrequency()/(tc1-tc0)));
|
||||||
putText(image, s.str(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255,0,255), 2.);
|
putText(image, s.str(), Point(5, 65), FONT_HERSHEY_SIMPLEX, 1., Scalar(255,0,255), 2);
|
||||||
|
|
||||||
s.str("");
|
s.str("");
|
||||||
s << "total FPS: " << cvRound((getTickFrequency()/(t1-t0)));
|
s << "total FPS: " << cvRound((getTickFrequency()/(t1-t0)));
|
||||||
putText(image, s.str(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255,0,255), 2.);
|
putText(image, s.str(), Point(5, 105), FONT_HERSHEY_SIMPLEX, 1., Scalar(255,0,255), 2);
|
||||||
|
|
||||||
imshow("flow", image);
|
imshow("flow", image);
|
||||||
|
|
||||||
|
@@ -63,7 +63,7 @@ void TestSystem::finishCurrentSubtest()
|
|||||||
double cpu_time = cpu_elapsed_ / getTickFrequency() * 1000.0;
|
double cpu_time = cpu_elapsed_ / getTickFrequency() * 1000.0;
|
||||||
double gpu_time = gpu_elapsed_ / getTickFrequency() * 1000.0;
|
double gpu_time = gpu_elapsed_ / getTickFrequency() * 1000.0;
|
||||||
|
|
||||||
double speedup = static_cast<double>(cpu_elapsed_) / std::max((int64)1, gpu_elapsed_);
|
double speedup = static_cast<double>(cpu_elapsed_) / std::max(1.0, gpu_elapsed_);
|
||||||
speedup_total_ += speedup;
|
speedup_total_ += speedup;
|
||||||
|
|
||||||
printMetrics(cpu_time, gpu_time, speedup);
|
printMetrics(cpu_time, gpu_time, speedup);
|
||||||
|
@@ -127,8 +127,10 @@ private:
|
|||||||
std::stringstream cur_subtest_description_;
|
std::stringstream cur_subtest_description_;
|
||||||
bool cur_subtest_is_empty_;
|
bool cur_subtest_is_empty_;
|
||||||
|
|
||||||
int64 cpu_started_, cpu_elapsed_;
|
int64 cpu_started_;
|
||||||
int64 gpu_started_, gpu_elapsed_;
|
int64 gpu_started_;
|
||||||
|
double cpu_elapsed_;
|
||||||
|
double gpu_elapsed_;
|
||||||
|
|
||||||
double speedup_total_;
|
double speedup_total_;
|
||||||
int num_subtests_called_;
|
int num_subtests_called_;
|
||||||
|
@@ -1199,7 +1199,7 @@ TEST(FarnebackOpticalFlow)
|
|||||||
if (frame1.empty()) throw runtime_error("can't open " + datasets[i] + "2.png");
|
if (frame1.empty()) throw runtime_error("can't open " + datasets[i] + "2.png");
|
||||||
|
|
||||||
gpu::FarnebackOpticalFlow calc;
|
gpu::FarnebackOpticalFlow calc;
|
||||||
calc.fastPyramids = fastPyramids;
|
calc.fastPyramids = fastPyramids != 0;
|
||||||
calc.flags |= useGaussianBlur ? OPTFLOW_FARNEBACK_GAUSSIAN : 0;
|
calc.flags |= useGaussianBlur ? OPTFLOW_FARNEBACK_GAUSSIAN : 0;
|
||||||
|
|
||||||
gpu::GpuMat d_frame0(frame0), d_frame1(frame1), d_flowx, d_flowy;
|
gpu::GpuMat d_frame0(frame0), d_frame1(frame1), d_flowx, d_flowy;
|
||||||
|
Reference in New Issue
Block a user