added __forceinline__ to device functions
fixed BFM warning ("cannot tell what pointer points to")
parent 79f3260b8e
commit 1c1a61dd37
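As context for the hunks below: __forceinline__ is nvcc's directive that a __device__ function must be inlined into its caller, which matters for the tiny per-pixel functors these kernels instantiate. A minimal sketch of the pattern, assuming a hypothetical functor and kernel (neither is code from this commit):

// Hypothetical illustration of the pattern applied throughout this commit:
// a small device functor whose operator() is forced inline, so the kernel
// pays no per-element call overhead.
struct PlusOne
{
    __device__ __forceinline__ int operator()(int x) const { return x + 1; }
};

template <typename Op>
__global__ void transformKernel(const int* src, int* dst, int n, Op op)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = op(src[i]); // op() is folded into the kernel body
}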
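The "cannot tell what pointer points to" advisory is emitted by older nvcc targets, which must assign an address space to every dereference and warn when they have to guess. The brute-force matcher hunks below silence it by routing train-descriptor reads through ForceGlob<T>::Load from the newly included datamov_utils.hpp. A hedged sketch of the idea follows; the helper body is an assumption (the real OpenCV helper is not reproduced here and, on pre-Fermi targets, forces an explicit global-memory load):

// ForceGlobSketch is a hypothetical stand-in modeled only on the call sites
// visible in this diff; it is not the datamov_utils.hpp implementation.
template <typename T> struct ForceGlobSketch
{
    static __device__ __forceinline__ void Load(const T* ptr, int offset, T& val)
    {
        // Assumption: the real helper makes the global address space explicit
        // (e.g. via a ld.global PTX instruction on sm_1x), so the compiler
        // never has to guess what the pointer points to.
        val = ptr[offset];
    }
};

// Call-site shape used by the fixed matcher code:
//     T trainVal;
//     ForceGlobSketch<T>::Load(trainDescs, i, trainVal);
//     dist.reduceIter(queryDescs[i], trainVal);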
@@ -56,7 +56,7 @@ namespace cv
 // It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile

 #if defined(__CUDACC__)
-#define __CV_GPU_HOST_DEVICE__ __host__ __device__
+#define __CV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
 #else
 #define __CV_GPU_HOST_DEVICE__
 #endif

@@ -42,6 +42,7 @@

 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/limits_gpu.hpp"
+#include "opencv2/gpu/device/datamov_utils.hpp"

 using namespace cv::gpu;
 using namespace cv::gpu::device;
@@ -60,7 +61,7 @@ namespace cv { namespace gpu { namespace bfmatcher
 public:
 explicit SingleMask(const PtrStep& mask_) : mask(mask_) {}

-__device__ bool operator()(int queryIdx, int trainIdx) const
+__device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
 {
 return mask.ptr(queryIdx)[trainIdx] != 0;
 }
@@ -74,14 +75,15 @@ namespace cv { namespace gpu { namespace bfmatcher
 public:
 explicit MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {}

-__device__ void nextMask()
+__device__ __forceinline__ void nextMask()
 {
 curMask = *maskCollection++;
 }

-__device__ bool operator()(int queryIdx, int trainIdx) const
-{
-return curMask.data == 0 || curMask.ptr(queryIdx)[trainIdx] != 0;
+__device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
+{
+uchar val;
+return curMask.data == 0 || (ForceGlob<uchar>::Load(curMask.ptr(queryIdx), trainIdx, val), (val != 0));
 }

 private:
@@ -92,10 +94,10 @@ namespace cv { namespace gpu { namespace bfmatcher
 class WithOutMask
 {
 public:
-__device__ void nextMask()
+__device__ __forceinline__ void nextMask()
 {
 }
-__device__ bool operator()(int queryIdx, int trainIdx) const
+__device__ __forceinline__ bool operator()(int queryIdx, int trainIdx) const
 {
 return true;
 }
@@ -132,19 +134,19 @@ namespace cv { namespace gpu { namespace bfmatcher
 typedef int ResultType;
 typedef int ValueType;

-__device__ L1Dist() : mySum(0) {}
+__device__ __forceinline__ L1Dist() : mySum(0) {}

-__device__ void reduceIter(int val1, int val2)
+__device__ __forceinline__ void reduceIter(int val1, int val2)
 {
 mySum = __sad(val1, val2, mySum);
 }

-template <int BLOCK_DIM_X> __device__ void reduceAll(int* sdiff_row)
+template <int BLOCK_DIM_X> __device__ __forceinline__ void reduceAll(int* sdiff_row)
 {
 SumReductor<BLOCK_DIM_X>::reduce(sdiff_row, mySum);
 }

-__device__ operator int() const
+__device__ __forceinline__ operator int() const
 {
 return mySum;
 }
@@ -158,19 +160,19 @@ namespace cv { namespace gpu { namespace bfmatcher
 typedef float ResultType;
 typedef float ValueType;

-__device__ L1Dist() : mySum(0.0f) {}
+__device__ __forceinline__ L1Dist() : mySum(0.0f) {}

-__device__ void reduceIter(float val1, float val2)
+__device__ __forceinline__ void reduceIter(float val1, float val2)
 {
 mySum += fabs(val1 - val2);
 }

-template <int BLOCK_DIM_X> __device__ void reduceAll(float* sdiff_row)
+template <int BLOCK_DIM_X> __device__ __forceinline__ void reduceAll(float* sdiff_row)
 {
 SumReductor<BLOCK_DIM_X>::reduce(sdiff_row, mySum);
 }

-__device__ operator float() const
+__device__ __forceinline__ operator float() const
 {
 return mySum;
 }
@@ -185,20 +187,20 @@ namespace cv { namespace gpu { namespace bfmatcher
 typedef float ResultType;
 typedef float ValueType;

-__device__ L2Dist() : mySum(0.0f) {}
+__device__ __forceinline__ L2Dist() : mySum(0.0f) {}

-__device__ void reduceIter(float val1, float val2)
+__device__ __forceinline__ void reduceIter(float val1, float val2)
 {
 float reg = val1 - val2;
 mySum += reg * reg;
 }

-template <int BLOCK_DIM_X> __device__ void reduceAll(float* sdiff_row)
+template <int BLOCK_DIM_X> __device__ __forceinline__ void reduceAll(float* sdiff_row)
 {
 SumReductor<BLOCK_DIM_X>::reduce(sdiff_row, mySum);
 }

-__device__ operator float() const
+__device__ __forceinline__ operator float() const
 {
 return sqrtf(mySum);
 }
@@ -213,19 +215,19 @@ namespace cv { namespace gpu { namespace bfmatcher
 typedef int ResultType;
 typedef int ValueType;

-__device__ HammingDist() : mySum(0) {}
+__device__ __forceinline__ HammingDist() : mySum(0) {}

-__device__ void reduceIter(int val1, int val2)
+__device__ __forceinline__ void reduceIter(int val1, int val2)
 {
 mySum += __popc(val1 ^ val2);
 }

-template <int BLOCK_DIM_X> __device__ void reduceAll(int* sdiff_row)
+template <int BLOCK_DIM_X> __device__ __forceinline__ void reduceAll(int* sdiff_row)
 {
 SumReductor<BLOCK_DIM_X>::reduce(sdiff_row, mySum);
 }

-__device__ operator int() const
+__device__ __forceinline__ operator int() const
 {
 return mySum;
 }
@@ -241,7 +243,11 @@ namespace cv { namespace gpu { namespace bfmatcher
 __device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row)
 {
 for (int i = threadIdx.x; i < desc_len; i += BLOCK_DIM_X)
-dist.reduceIter(queryDescs[i], trainDescs[i]);
+{
+T trainVal;
+ForceGlob<T>::Load(trainDescs, i, trainVal);
+dist.reduceIter(queryDescs[i], trainVal);
+}

 dist.reduceAll<BLOCK_DIM_X>(sdiff_row);
 }
@@ -282,7 +288,9 @@ namespace cv { namespace gpu { namespace bfmatcher
 {
 if (ind < desc_len)
 {
-dist.reduceIter(*queryVals, trainDescs[ind]);
+T trainVal;
+ForceGlob<T>::Load(trainDescs, ind, trainVal);
+dist.reduceIter(*queryVals, trainVal);

 ++queryVals;

@@ -293,7 +301,9 @@ namespace cv { namespace gpu { namespace bfmatcher
 template <typename Dist, typename T>
 static __device__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist)
 {
-dist.reduceIter(*queryVals, *trainDescs);
+T trainVal;
+ForceGlob<T>::Load(trainDescs, 0, trainVal);
+dist.reduceIter(*queryVals, trainVal);

 ++queryVals;
 trainDescs += blockDim.x;
@@ -304,13 +314,13 @@ namespace cv { namespace gpu { namespace bfmatcher
 template <> struct UnrollDescDiff<0>
 {
 template <typename Dist, typename T>
-static __device__ void calcCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len,
+static __device__ __forceinline__ void calcCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len,
 Dist& dist, int ind)
 {
 }

 template <typename Dist, typename T>
-static __device__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist)
+static __device__ __forceinline__ void calcWithoutCheck(const typename Dist::ValueType* queryVals, const T* trainDescs, Dist& dist)
 {
 }
 };
@@ -320,7 +330,7 @@ namespace cv { namespace gpu { namespace bfmatcher
 struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, false>
 {
 template <typename Dist, typename T>
-static __device__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist)
+static __device__ __forceinline__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist)
 {
 UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcCheck(queryVals, trainDescs, desc_len, dist, threadIdx.x);
 }
@@ -329,14 +339,14 @@ namespace cv { namespace gpu { namespace bfmatcher
 struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, true>
 {
 template <typename Dist, typename T>
-static __device__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist)
+static __device__ __forceinline__ void calc(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist)
 {
 UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcWithoutCheck(queryVals, trainDescs + threadIdx.x, dist);
 }
 };

 template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T>
-__device__ void reduceDescDiffCached(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row)
+__device__ __forceinline__ void reduceDescDiffCached(const typename Dist::ValueType* queryVals, const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row)
 {
 DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>::calc(queryVals, trainDescs, desc_len, dist);

@@ -419,13 +429,13 @@ namespace cv { namespace gpu { namespace bfmatcher
 class ReduceDescCalculatorSimple
 {
 public:
-__device__ void prepare(const T* queryDescs_, int, void*)
+__device__ __forceinline__ void prepare(const T* queryDescs_, int, void*)
 {
 queryDescs = queryDescs_;
 }

 template <typename Dist>
-__device__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const
+__device__ __forceinline__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const
 {
 reduceDescDiff<BLOCK_DIM_X>(queryDescs, trainDescs, desc_len, dist, sdiff_row);
 }
@@ -438,13 +448,13 @@ namespace cv { namespace gpu { namespace bfmatcher
 class ReduceDescCalculatorCached
 {
 public:
-__device__ void prepare(const T* queryDescs, int desc_len, U* smem)
+__device__ __forceinline__ void prepare(const T* queryDescs, int desc_len, U* smem)
 {
 loadDescsVals<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, queryVals, smem);
 }

 template <typename Dist>
-__device__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const
+__device__ __forceinline__ void calc(const T* trainDescs, int desc_len, Dist& dist, typename Dist::ResultType* sdiff_row) const
 {
 reduceDescDiffCached<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>(queryVals, trainDescs, desc_len, dist, sdiff_row);
 }
@@ -496,13 +506,13 @@ namespace cv { namespace gpu { namespace bfmatcher
 }

 template <typename Dist, typename ReduceDescCalculator, typename Mask>
-__device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc,
+__device__ __forceinline__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc,
 typename Dist::ResultType& myMin, int& myBestTrainIdx, int& myBestImgIdx, typename Dist::ResultType* sdiff_row) const
 {
 matchDescs<Dist>(queryIdx, 0, trainDescs, m, reduceDescCalc, myMin, myBestTrainIdx, myBestImgIdx, sdiff_row);
 }

-__device__ int desc_len() const
+__device__ __forceinline__ int desc_len() const
 {
 return trainDescs.cols;
 }
@@ -532,7 +542,7 @@ namespace cv { namespace gpu { namespace bfmatcher
 }
 }

-__device__ int desc_len() const
+__device__ __forceinline__ int desc_len() const
 {
 return desclen;
 }

@@ -56,7 +56,7 @@ namespace cv { namespace gpu

 struct TransformOp
 {
-__device__ float3 operator()(float3 p) const
+__device__ __forceinline__ float3 operator()(float3 p) const
 {
 return make_float3(
 crot0.x * p.x + crot0.y * p.y + crot0.z * p.z + ctransl.x,
@@ -89,7 +89,7 @@ namespace cv { namespace gpu

 struct ProjectOp
 {
-__device__ float2 operator()(float3 p) const
+__device__ __forceinline__ float2 operator()(float3 p) const
 {
 // Rotate and translate in 3D
 float3 t = make_float3(
@@ -128,7 +128,7 @@ namespace cv { namespace gpu
 return SOLVE_PNP_RANSAC_MAX_NUM_ITERS;
 }

-__device__ float sqr(float x)
+__device__ __forceinline__ float sqr(float x)
 {
 return x * x;
 }

@@ -59,38 +59,38 @@ namespace cv { namespace gpu { namespace color
 template<> struct ColorChannel<uchar>
 {
 typedef float worktype_f;
-static __device__ uchar max() { return UCHAR_MAX; }
-static __device__ uchar half() { return (uchar)(max()/2 + 1); }
+static __device__ __forceinline__ uchar max() { return UCHAR_MAX; }
+static __device__ __forceinline__ uchar half() { return (uchar)(max()/2 + 1); }
 };
 template<> struct ColorChannel<ushort>
 {
 typedef float worktype_f;
-static __device__ ushort max() { return USHRT_MAX; }
-static __device__ ushort half() { return (ushort)(max()/2 + 1); }
+static __device__ __forceinline__ ushort max() { return USHRT_MAX; }
+static __device__ __forceinline__ ushort half() { return (ushort)(max()/2 + 1); }
 };
 template<> struct ColorChannel<float>
 {
 typedef float worktype_f;
-static __device__ float max() { return 1.f; }
-static __device__ float half() { return 0.5f; }
+static __device__ __forceinline__ float max() { return 1.f; }
+static __device__ __forceinline__ float half() { return 0.5f; }
 };

 template <typename T>
-__device__ void setAlpha(typename TypeVec<T, 3>::vec_t& vec, T val)
+__device__ __forceinline__ void setAlpha(typename TypeVec<T, 3>::vec_t& vec, T val)
 {
 }
 template <typename T>
-__device__ void setAlpha(typename TypeVec<T, 4>::vec_t& vec, T val)
+__device__ __forceinline__ void setAlpha(typename TypeVec<T, 4>::vec_t& vec, T val)
 {
 vec.w = val;
 }
 template <typename T>
-__device__ T getAlpha(const typename TypeVec<T, 3>::vec_t& vec)
+__device__ __forceinline__ T getAlpha(const typename TypeVec<T, 3>::vec_t& vec)
 {
 return ColorChannel<T>::max();
 }
 template <typename T>
-__device__ T getAlpha(const typename TypeVec<T, 4>::vec_t& vec)
+__device__ __forceinline__ T getAlpha(const typename TypeVec<T, 4>::vec_t& vec)
 {
 return vec.w;
 }
@@ -114,7 +114,7 @@ namespace cv { namespace gpu { namespace color

 explicit RGB2RGB(int bidx) : bidx(bidx) {}

-__device__ dst_t operator()(const src_t& src) const
+__device__ __forceinline__ dst_t operator()(const src_t& src) const
 {
 dst_t dst;

@@ -179,7 +179,7 @@ namespace cv { namespace gpu { namespace color
 template <> struct RGB5x52RGBConverter<5>
 {
 template <typename D>
-static __device__ void cvt(uint src, D& dst, int bidx)
+static __device__ __forceinline__ void cvt(uint src, D& dst, int bidx)
 {
 (&dst.x)[bidx] = (uchar)(src << 3);
 dst.y = (uchar)((src >> 2) & ~7);
@@ -190,7 +190,7 @@ namespace cv { namespace gpu { namespace color
 template <> struct RGB5x52RGBConverter<6>
 {
 template <typename D>
-static __device__ void cvt(uint src, D& dst, int bidx)
+static __device__ __forceinline__ void cvt(uint src, D& dst, int bidx)
 {
 (&dst.x)[bidx] = (uchar)(src << 3);
 dst.y = (uchar)((src >> 3) & ~3);
@@ -206,7 +206,7 @@ namespace cv { namespace gpu { namespace color

 explicit RGB5x52RGB(int bidx) : bidx(bidx) {}

-__device__ dst_t operator()(ushort src) const
+__device__ __forceinline__ dst_t operator()(ushort src) const
 {
 dst_t dst;
 RGB5x52RGBConverter<GREEN_BITS>::cvt((uint)src, dst, bidx);
@@ -221,18 +221,18 @@ namespace cv { namespace gpu { namespace color
 template<> struct RGB2RGB5x5Converter<6>
 {
 template <typename T>
-static __device__ ushort cvt(const T& src, int bidx)
+static __device__ __forceinline__ ushort cvt(const T& src, int bidx)
 {
 return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~3) << 3) | (((&src.x)[bidx^2] & ~7) << 8));
 }
 };
 template<> struct RGB2RGB5x5Converter<5>
 {
-static __device__ ushort cvt(const uchar3& src, int bidx)
+static __device__ __forceinline__ ushort cvt(const uchar3& src, int bidx)
 {
 return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7));
 }
-static __device__ ushort cvt(const uchar4& src, int bidx)
+static __device__ __forceinline__ ushort cvt(const uchar4& src, int bidx)
 {
 return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7) | (src.w ? 0x8000 : 0));
 }
@@ -245,7 +245,7 @@ namespace cv { namespace gpu { namespace color

 explicit RGB2RGB5x5(int bidx) : bidx(bidx) {}

-__device__ ushort operator()(const src_t& src)
+__device__ __forceinline__ ushort operator()(const src_t& src)
 {
 return RGB2RGB5x5Converter<GREEN_BITS>::cvt(src, bidx);
 }
@@ -299,7 +299,7 @@ namespace cv { namespace gpu { namespace color
 typedef T src_t;
 typedef typename TypeVec<T, DSTCN>::vec_t dst_t;

-__device__ dst_t operator()(const T& src) const
+__device__ __forceinline__ dst_t operator()(const T& src) const
 {
 dst_t dst;

@@ -313,14 +313,14 @@ namespace cv { namespace gpu { namespace color
 template <int GREEN_BITS> struct Gray2RGB5x5Converter;
 template<> struct Gray2RGB5x5Converter<6>
 {
-static __device__ ushort cvt(uint t)
+static __device__ __forceinline__ ushort cvt(uint t)
 {
 return (ushort)((t >> 3) | ((t & ~3) << 3) | ((t & ~7) << 8));
 }
 };
 template<> struct Gray2RGB5x5Converter<5>
 {
-static __device__ ushort cvt(uint t)
+static __device__ __forceinline__ ushort cvt(uint t)
 {
 t >>= 3;
 return (ushort)(t | (t << 5) | (t << 10));
@@ -332,7 +332,7 @@ namespace cv { namespace gpu { namespace color
 typedef uchar src_t;
 typedef ushort dst_t;

-__device__ ushort operator()(uchar src) const
+__device__ __forceinline__ ushort operator()(uchar src) const
 {
 return Gray2RGB5x5Converter<GREEN_BITS>::cvt((uint)src);
 }
@@ -406,14 +406,14 @@ namespace cv { namespace gpu { namespace color
 template <int GREEN_BITS> struct RGB5x52GrayConverter;
 template<> struct RGB5x52GrayConverter<6>
 {
-static __device__ uchar cvt(uint t)
+static __device__ __forceinline__ uchar cvt(uint t)
 {
 return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 3) & 0xfc) * G2Y + ((t >> 8) & 0xf8) * R2Y, yuv_shift);
 }
 };
 template<> struct RGB5x52GrayConverter<5>
 {
-static __device__ uchar cvt(uint t)
+static __device__ __forceinline__ uchar cvt(uint t)
 {
 return (uchar)CV_DESCALE(((t << 3) & 0xf8) * B2Y + ((t >> 2) & 0xf8) * G2Y + ((t >> 7) & 0xf8) * R2Y, yuv_shift);
 }
@@ -424,18 +424,18 @@ namespace cv { namespace gpu { namespace color
 typedef ushort src_t;
 typedef uchar dst_t;

-__device__ uchar operator()(ushort src) const
+__device__ __forceinline__ uchar operator()(ushort src) const
 {
 return RGB5x52GrayConverter<GREEN_BITS>::cvt((uint)src);
 }
 };

 template <typename T>
-__device__ T RGB2GrayConvert(const T* src, int bidx)
+__device__ __forceinline__ T RGB2GrayConvert(const T* src, int bidx)
 {
 return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift);
 }
-__device__ float RGB2GrayConvert(const float* src, int bidx)
+__device__ __forceinline__ float RGB2GrayConvert(const float* src, int bidx)
 {
 const float cr = 0.299f;
 const float cg = 0.587f;
@@ -451,7 +451,7 @@ namespace cv { namespace gpu { namespace color

 explicit RGB2Gray(int bidx) : bidx(bidx) {}

-__device__ T operator()(const src_t& src)
+__device__ __forceinline__ T operator()(const src_t& src)
 {
 return RGB2GrayConvert(&src.x, bidx);
 }
@@ -515,7 +515,7 @@ namespace cv { namespace gpu { namespace color
 __constant__ float cYCrCbCoeffs_f[5];

 template <typename T, typename D>
-__device__ void RGB2YCrCbConvert(const T* src, D& dst, int bidx)
+__device__ __forceinline__ void RGB2YCrCbConvert(const T* src, D& dst, int bidx)
 {
 const int delta = ColorChannel<T>::half() * (1 << yuv_shift);

@@ -528,7 +528,7 @@ namespace cv { namespace gpu { namespace color
 dst.z = saturate_cast<T>(Cb);
 }
 template <typename D>
-static __device__ void RGB2YCrCbConvert(const float* src, D& dst, int bidx)
+static __device__ __forceinline__ void RGB2YCrCbConvert(const float* src, D& dst, int bidx)
 {
 dst.x = src[0] * cYCrCbCoeffs_f[0] + src[1] * cYCrCbCoeffs_f[1] + src[2] * cYCrCbCoeffs_f[2];
 dst.y = (src[bidx^2] - dst.x) * cYCrCbCoeffs_f[3] + ColorChannel<float>::half();
@@ -561,7 +561,7 @@ namespace cv { namespace gpu { namespace color

 RGB2YCrCb(int bidx, const coeff_t coeffs[5]) : RGB2YCrCbBase<T>(coeffs), bidx(bidx) {}

-__device__ dst_t operator()(const src_t& src) const
+__device__ __forceinline__ dst_t operator()(const src_t& src) const
 {
 dst_t dst;
 RGB2YCrCbConvert(&src.x, dst, bidx);
@@ -573,7 +573,7 @@ namespace cv { namespace gpu { namespace color
 };

 template <typename T, typename D>
-__device__ void YCrCb2RGBConvert(const T& src, D* dst, int bidx)
+__device__ __forceinline__ void YCrCb2RGBConvert(const T& src, D* dst, int bidx)
 {
 const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * cYCrCbCoeffs_i[3], yuv_shift);
 const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * cYCrCbCoeffs_i[2] + (src.y - ColorChannel<D>::half()) * cYCrCbCoeffs_i[1], yuv_shift);
@@ -584,7 +584,7 @@ namespace cv { namespace gpu { namespace color
 dst[bidx^2] = saturate_cast<D>(r);
 }
 template <typename T>
-__device__ void YCrCb2RGBConvert(const T& src, float* dst, int bidx)
+__device__ __forceinline__ void YCrCb2RGBConvert(const T& src, float* dst, int bidx)
 {
 dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * cYCrCbCoeffs_f[3];
 dst[1] = src.x + (src.z - ColorChannel<float>::half()) * cYCrCbCoeffs_f[2] + (src.y - ColorChannel<float>::half()) * cYCrCbCoeffs_f[1];
@@ -617,7 +617,7 @@ namespace cv { namespace gpu { namespace color

 YCrCb2RGB(int bidx, const coeff_t coeffs[4]) : YCrCb2RGBBase<T>(coeffs), bidx(bidx) {}

-__device__ dst_t operator()(const src_t& src) const
+__device__ __forceinline__ dst_t operator()(const src_t& src) const
 {
 dst_t dst;

@@ -725,14 +725,14 @@ namespace cv { namespace gpu { namespace color
 __constant__ float cXYZ_D65f[9];

 template <typename T, typename D>
-__device__ void RGB2XYZConvert(const T* src, D& dst)
+__device__ __forceinline__ void RGB2XYZConvert(const T* src, D& dst)
 {
 dst.x = saturate_cast<T>(CV_DESCALE(src[0] * cXYZ_D65i[0] + src[1] * cXYZ_D65i[1] + src[2] * cXYZ_D65i[2], xyz_shift));
 dst.y = saturate_cast<T>(CV_DESCALE(src[0] * cXYZ_D65i[3] + src[1] * cXYZ_D65i[4] + src[2] * cXYZ_D65i[5], xyz_shift));
 dst.z = saturate_cast<T>(CV_DESCALE(src[0] * cXYZ_D65i[6] + src[1] * cXYZ_D65i[7] + src[2] * cXYZ_D65i[8], xyz_shift));
 }
 template <typename D>
-__device__ void RGB2XYZConvert(const float* src, D& dst)
+__device__ __forceinline__ void RGB2XYZConvert(const float* src, D& dst)
 {
 dst.x = src[0] * cXYZ_D65f[0] + src[1] * cXYZ_D65f[1] + src[2] * cXYZ_D65f[2];
 dst.y = src[0] * cXYZ_D65f[3] + src[1] * cXYZ_D65f[4] + src[2] * cXYZ_D65f[5];
@@ -765,7 +765,7 @@ namespace cv { namespace gpu { namespace color

 explicit RGB2XYZ(const coeff_t coeffs[9]) : RGB2XYZBase<T>(coeffs) {}

-__device__ dst_t operator()(const src_t& src) const
+__device__ __forceinline__ dst_t operator()(const src_t& src) const
 {
 dst_t dst;
 RGB2XYZConvert(&src.x, dst);
@@ -774,14 +774,14 @@ namespace cv { namespace gpu { namespace color
 };

 template <typename T, typename D>
-__device__ void XYZ2RGBConvert(const T& src, D* dst)
+__device__ __forceinline__ void XYZ2RGBConvert(const T& src, D* dst)
 {
 dst[0] = saturate_cast<D>(CV_DESCALE(src.x * cXYZ_D65i[0] + src.y * cXYZ_D65i[1] + src.z * cXYZ_D65i[2], xyz_shift));
 dst[1] = saturate_cast<D>(CV_DESCALE(src.x * cXYZ_D65i[3] + src.y * cXYZ_D65i[4] + src.z * cXYZ_D65i[5], xyz_shift));
 dst[2] = saturate_cast<D>(CV_DESCALE(src.x * cXYZ_D65i[6] + src.y * cXYZ_D65i[7] + src.z * cXYZ_D65i[8], xyz_shift));
 }
 template <typename T>
-__device__ void XYZ2RGBConvert(const T& src, float* dst)
+__device__ __forceinline__ void XYZ2RGBConvert(const T& src, float* dst)
 {
 dst[0] = src.x * cXYZ_D65f[0] + src.y * cXYZ_D65f[1] + src.z * cXYZ_D65f[2];
 dst[1] = src.x * cXYZ_D65f[3] + src.y * cXYZ_D65f[4] + src.z * cXYZ_D65f[5];
@@ -814,7 +814,7 @@ namespace cv { namespace gpu { namespace color

 explicit XYZ2RGB(const coeff_t coeffs[9]) : XYZ2RGBBase<T>(coeffs) {}

-__device__ dst_t operator()(const src_t& src) const
+__device__ __forceinline__ dst_t operator()(const src_t& src) const
 {
 dst_t dst;
 XYZ2RGBConvert(src, &dst.x);
@@ -987,7 +987,7 @@ namespace cv { namespace gpu { namespace color

 explicit RGB2HSV(int bidx) : bidx(bidx) {}

-__device__ dst_t operator()(const src_t& src) const
+__device__ __forceinline__ dst_t operator()(const src_t& src) const
 {
 dst_t dst;
 RGB2HSVConvert<HR>(&src.x, dst, bidx);
@@ -1062,7 +1062,7 @@ namespace cv { namespace gpu { namespace color

 explicit HSV2RGB(int bidx) : bidx(bidx) {}

-__device__ dst_t operator()(const src_t& src) const
+__device__ __forceinline__ dst_t operator()(const src_t& src) const
 {
 dst_t dst;
 HSV2RGBConvert<HR>(src, &dst.x, bidx);
@@ -1214,7 +1214,7 @@ namespace cv { namespace gpu { namespace color

 explicit RGB2HLS(int bidx) : bidx(bidx) {}

-__device__ dst_t operator()(const src_t& src) const
+__device__ __forceinline__ dst_t operator()(const src_t& src) const
 {
 dst_t dst;
 RGB2HLSConvert<HR>(&src.x, dst, bidx);
@@ -1295,7 +1295,7 @@ namespace cv { namespace gpu { namespace color

 explicit HLS2RGB(int bidx) : bidx(bidx) {}

-__device__ dst_t operator()(const src_t& src) const
+__device__ __forceinline__ dst_t operator()(const src_t& src) const
 {
 dst_t dst;
 HLS2RGBConvert<HR>(src, &dst.x, bidx);

@@ -57,7 +57,7 @@ namespace cv { namespace gpu { namespace mathfunc
 template <typename T1, typename T2>
 struct NotEqual
 {
-__device__ uchar operator()(const T1& src1, const T2& src2)
+__device__ __forceinline__ uchar operator()(const T1& src1, const T2& src2)
 {
 return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
 }
@@ -91,7 +91,7 @@ namespace cv { namespace gpu { namespace mathfunc
 template <typename T>
 struct UnOp<T, UN_OP_NOT>
 {
-static __device__ T call(T v) { return ~v; }
+static __device__ __forceinline__ T call(T v) { return ~v; }
 };


@@ -199,20 +199,20 @@ namespace cv { namespace gpu { namespace mathfunc
 template <typename T>
 struct BinOp<T, BIN_OP_OR>
 {
-static __device__ T call(T a, T b) { return a | b; }
+static __device__ __forceinline__ T call(T a, T b) { return a | b; }
 };


 template <typename T>
 struct BinOp<T, BIN_OP_AND>
 {
-static __device__ T call(T a, T b) { return a & b; }
+static __device__ __forceinline__ T call(T a, T b) { return a & b; }
 };

 template <typename T>
 struct BinOp<T, BIN_OP_XOR>
 {
-static __device__ T call(T a, T b) { return a ^ b; }
+static __device__ __forceinline__ T call(T a, T b) { return a ^ b; }
 };


@@ -357,15 +357,15 @@ namespace cv { namespace gpu { namespace mathfunc
 struct MinOp
 {
 template <typename T>
-__device__ T operator()(T a, T b)
+__device__ __forceinline__ T operator()(T a, T b)
 {
 return min(a, b);
 }
-__device__ float operator()(float a, float b)
+__device__ __forceinline__ float operator()(float a, float b)
 {
 return fmin(a, b);
 }
-__device__ double operator()(double a, double b)
+__device__ __forceinline__ double operator()(double a, double b)
 {
 return fmin(a, b);
 }
@@ -374,15 +374,15 @@ namespace cv { namespace gpu { namespace mathfunc
 struct MaxOp
 {
 template <typename T>
-__device__ T operator()(T a, T b)
+__device__ __forceinline__ T operator()(T a, T b)
 {
 return max(a, b);
 }
-__device__ float operator()(float a, float b)
+__device__ __forceinline__ float operator()(float a, float b)
 {
 return fmax(a, b);
 }
-__device__ double operator()(double a, double b)
+__device__ __forceinline__ double operator()(double a, double b)
 {
 return fmax(a, b);
 }
@@ -394,7 +394,7 @@ namespace cv { namespace gpu { namespace mathfunc

 explicit ScalarMinOp(T s_) : s(s_) {}

-__device__ T operator()(T a)
+__device__ __forceinline__ T operator()(T a)
 {
 return min(a, s);
 }
@@ -405,7 +405,7 @@ namespace cv { namespace gpu { namespace mathfunc

 explicit ScalarMinOp(float s_) : s(s_) {}

-__device__ float operator()(float a)
+__device__ __forceinline__ float operator()(float a)
 {
 return fmin(a, s);
 }
@@ -416,7 +416,7 @@ namespace cv { namespace gpu { namespace mathfunc

 explicit ScalarMinOp(double s_) : s(s_) {}

-__device__ double operator()(double a)
+__device__ __forceinline__ double operator()(double a)
 {
 return fmin(a, s);
 }
@@ -428,7 +428,7 @@ namespace cv { namespace gpu { namespace mathfunc

 explicit ScalarMaxOp(T s_) : s(s_) {}

-__device__ T operator()(T a)
+__device__ __forceinline__ T operator()(T a)
 {
 return max(a, s);
 }
@@ -439,7 +439,7 @@ namespace cv { namespace gpu { namespace mathfunc

 explicit ScalarMaxOp(float s_) : s(s_) {}

-__device__ float operator()(float a)
+__device__ __forceinline__ float operator()(float a)
 {
 return fmax(a, s);
 }
@@ -450,7 +450,7 @@ namespace cv { namespace gpu { namespace mathfunc

 explicit ScalarMaxOp(double s_) : s(s_) {}

-__device__ double operator()(double a)
+__device__ __forceinline__ double operator()(double a)
 {
 return fmax(a, s);
 }
@@ -524,7 +524,7 @@ namespace cv { namespace gpu { namespace mathfunc
 {
 ThreshBinary(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}

-__device__ T operator()(const T& src) const
+__device__ __forceinline__ T operator()(const T& src) const
 {
 return src > thresh ? maxVal : 0;
 }
@@ -538,7 +538,7 @@ namespace cv { namespace gpu { namespace mathfunc
 {
 ThreshBinaryInv(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}

-__device__ T operator()(const T& src) const
+__device__ __forceinline__ T operator()(const T& src) const
 {
 return src > thresh ? 0 : maxVal;
 }
@@ -552,7 +552,7 @@ namespace cv { namespace gpu { namespace mathfunc
 {
 ThreshTrunc(T thresh_, T) : thresh(thresh_) {}

-__device__ T operator()(const T& src) const
+__device__ __forceinline__ T operator()(const T& src) const
 {
 return min(src, thresh);
 }
@@ -564,7 +564,7 @@ namespace cv { namespace gpu { namespace mathfunc
 {
 ThreshTrunc(float thresh_, float) : thresh(thresh_) {}

-__device__ float operator()(const float& src) const
+__device__ __forceinline__ float operator()(const float& src) const
 {
 return fmin(src, thresh);
 }
@@ -576,7 +576,7 @@ namespace cv { namespace gpu { namespace mathfunc
 {
 ThreshTrunc(double thresh_, double) : thresh(thresh_) {}

-__device__ double operator()(const double& src) const
+__device__ __forceinline__ double operator()(const double& src) const
 {
 return fmin(src, thresh);
 }
@@ -590,7 +590,7 @@ namespace cv { namespace gpu { namespace mathfunc
 public:
 ThreshToZero(T thresh_, T) : thresh(thresh_) {}

-__device__ T operator()(const T& src) const
+__device__ __forceinline__ T operator()(const T& src) const
 {
 return src > thresh ? src : 0;
 }
@@ -604,7 +604,7 @@ namespace cv { namespace gpu { namespace mathfunc
 public:
 ThreshToZeroInv(T thresh_, T) : thresh(thresh_) {}

-__device__ T operator()(const T& src) const
+__device__ __forceinline__ T operator()(const T& src) const
 {
 return src > thresh ? 0 : src;
 }

@@ -406,7 +406,7 @@ namespace bf_krnls
 template <int channels>
 struct DistRgbMax
 {
-static __device__ uchar calc(const uchar* a, const uchar* b)
+static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
 {
 uchar x = abs(a[0] - b[0]);
 uchar y = abs(a[1] - b[1]);
@@ -418,7 +418,7 @@ namespace bf_krnls
 template <>
 struct DistRgbMax<1>
 {
-static __device__ uchar calc(const uchar* a, const uchar* b)
+static __device__ __forceinline__ uchar calc(const uchar* a, const uchar* b)
 {
 return abs(a[0] - b[0]);
 }

@@ -48,35 +48,35 @@ using namespace cv::gpu::device;

 namespace cv { namespace gpu { namespace imgproc {

-__device__ float sum(float v) { return v; }
-__device__ float sum(float2 v) { return v.x + v.y; }
-__device__ float sum(float3 v) { return v.x + v.y + v.z; }
-__device__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }
+__device__ __forceinline__ float sum(float v) { return v; }
+__device__ __forceinline__ float sum(float2 v) { return v.x + v.y; }
+__device__ __forceinline__ float sum(float3 v) { return v.x + v.y + v.z; }
+__device__ __forceinline__ float sum(float4 v) { return v.x + v.y + v.z + v.w; }

-__device__ float first(float v) { return v; }
-__device__ float first(float2 v) { return v.x; }
-__device__ float first(float3 v) { return v.x; }
-__device__ float first(float4 v) { return v.x; }
+__device__ __forceinline__ float first(float v) { return v; }
+__device__ __forceinline__ float first(float2 v) { return v.x; }
+__device__ __forceinline__ float first(float3 v) { return v.x; }
+__device__ __forceinline__ float first(float4 v) { return v.x; }

-__device__ float mul(float a, float b) { return a * b; }
-__device__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
-__device__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
-__device__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+__device__ __forceinline__ float mul(float a, float b) { return a * b; }
+__device__ __forceinline__ float2 mul(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
+__device__ __forceinline__ float3 mul(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
+__device__ __forceinline__ float4 mul(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }

-__device__ float mul(uchar a, uchar b) { return a * b; }
-__device__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
-__device__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
-__device__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }
+__device__ __forceinline__ float mul(uchar a, uchar b) { return a * b; }
+__device__ __forceinline__ float2 mul(uchar2 a, uchar2 b) { return make_float2(a.x * b.x, a.y * b.y); }
+__device__ __forceinline__ float3 mul(uchar3 a, uchar3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); }
+__device__ __forceinline__ float4 mul(uchar4 a, uchar4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); }

-__device__ float sub(float a, float b) { return a - b; }
-__device__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
-__device__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
-__device__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+__device__ __forceinline__ float sub(float a, float b) { return a - b; }
+__device__ __forceinline__ float2 sub(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); }
+__device__ __forceinline__ float3 sub(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
+__device__ __forceinline__ float4 sub(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }

-__device__ float sub(uchar a, uchar b) { return a - b; }
-__device__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
-__device__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
-__device__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }
+__device__ __forceinline__ float sub(uchar a, uchar b) { return a - b; }
+__device__ __forceinline__ float2 sub(uchar2 a, uchar2 b) { return make_float2(a.x - b.x, a.y - b.y); }
+__device__ __forceinline__ float3 sub(uchar3 a, uchar3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); }
+__device__ __forceinline__ float4 sub(uchar4 a, uchar4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); }


 template <typename T, int cn>

@@ -60,27 +60,27 @@ namespace cv { namespace gpu { namespace mathfunc
 {
 struct Nothing
 {
-static __device__ void calc(int, int, float, float, float*, size_t, float)
+static __device__ __forceinline__ void calc(int, int, float, float, float*, size_t, float)
 {
 }
 };
 struct Magnitude
 {
-static __device__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
+static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
 {
 dst[y * dst_step + x] = sqrtf(x_data * x_data + y_data * y_data);
 }
 };
 struct MagnitudeSqr
 {
-static __device__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
+static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float)
 {
 dst[y * dst_step + x] = x_data * x_data + y_data * y_data;
 }
 };
 struct Atan2
 {
-static __device__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
+static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
 {
 dst[y * dst_step + x] = scale * atan2f(y_data, x_data);
 }
@@ -104,14 +104,14 @@ namespace cv { namespace gpu { namespace mathfunc

 struct NonEmptyMag
 {
-static __device__ float get(const float* mag, size_t mag_step, int x, int y)
+static __device__ __forceinline__ float get(const float* mag, size_t mag_step, int x, int y)
 {
 return mag[y * mag_step + x];
 }
 };
 struct EmptyMag
 {
-static __device__ float get(const float*, size_t, int, int)
+static __device__ __forceinline__ float get(const float*, size_t, int, int)
 {
 return 1.0f;
 }

@@ -123,14 +123,14 @@ namespace cv { namespace gpu { namespace matrix_operations {
 __constant__ float scalar_32f[4];
 __constant__ double scalar_64f[4];

-template <typename T> __device__ T readScalar(int i);
-template <> __device__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}
-template <> __device__ schar readScalar<schar>(int i) {return scalar_8s[i];}
-template <> __device__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}
-template <> __device__ short readScalar<short>(int i) {return scalar_16s[i];}
-template <> __device__ int readScalar<int>(int i) {return scalar_32s[i];}
-template <> __device__ float readScalar<float>(int i) {return scalar_32f[i];}
-template <> __device__ double readScalar<double>(int i) {return scalar_64f[i];}
+template <typename T> __device__ __forceinline__ T readScalar(int i);
+template <> __device__ __forceinline__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}
+template <> __device__ __forceinline__ schar readScalar<schar>(int i) {return scalar_8s[i];}
+template <> __device__ __forceinline__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}
+template <> __device__ __forceinline__ short readScalar<short>(int i) {return scalar_16s[i];}
+template <> __device__ __forceinline__ int readScalar<int>(int i) {return scalar_32s[i];}
+template <> __device__ __forceinline__ float readScalar<float>(int i) {return scalar_32f[i];}
+template <> __device__ __forceinline__ double readScalar<double>(int i) {return scalar_64f[i];}

 void writeScalar(const uchar* vals)
 {
@@ -243,7 +243,7 @@ namespace cv { namespace gpu { namespace matrix_operations {
 public:
 Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}

-__device__ D operator()(const T& src)
+__device__ __forceinline__ D operator()(const T& src)
 {
 return saturate_cast<D>(alpha * src + beta);
 }

@@ -78,7 +78,7 @@ namespace cv { namespace gpu { namespace mathfunc
 {
 explicit Mask8U(PtrStep mask): mask(mask) {}

-__device__ bool operator()(int y, int x) const
+__device__ __forceinline__ bool operator()(int y, int x) const
 {
 return mask.ptr(y)[x];
 }
@@ -89,7 +89,7 @@ namespace cv { namespace gpu { namespace mathfunc

 struct MaskTrue
 {
-__device__ bool operator()(int y, int x) const
+__device__ __forceinline__ bool operator()(int y, int x) const
 {
 return true;
 }
@@ -153,7 +153,7 @@ namespace cv { namespace gpu { namespace mathfunc

 // Does min and max in shared memory
 template <typename T>
-__device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval)
+__device__ __forceinline__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval)
 {
 minval[tid] = min(minval[tid], minval[tid + offset]);
 maxval[tid] = max(maxval[tid], maxval[tid + offset]);
@@ -976,16 +976,16 @@ namespace cv { namespace gpu { namespace mathfunc
 template <> struct SumType<double> { typedef double R; };

 template <typename R>
-struct IdentityOp { static __device__ R call(R x) { return x; } };
+struct IdentityOp { static __device__ __forceinline__ R call(R x) { return x; } };

 template <typename R>
-struct AbsOp { static __device__ R call(R x) { return abs(x); } };
+struct AbsOp { static __device__ __forceinline__ R call(R x) { return abs(x); } };

 template <>
-struct AbsOp<uint> { static __device__ uint call(uint x) { return x; } };
+struct AbsOp<uint> { static __device__ __forceinline__ uint call(uint x) { return x; } };

 template <typename R>
-struct SqrOp { static __device__ R call(R x) { return x * x; } };
+struct SqrOp { static __device__ __forceinline__ R call(R x) { return x * x; } };

 __constant__ int ctwidth;
 __constant__ int ctheight;

@@ -68,7 +68,7 @@ __constant__ size_t cminSSD_step;
 __constant__ int cwidth;
 __constant__ int cheight;

-__device__ int SQ(int a)
+__device__ __forceinline__ int SQ(int a)
 {
 return a * a;
 }
@@ -419,7 +419,7 @@ extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output,

 texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;

-__device__ float sobel(int x, int y)
+__device__ __forceinline__ float sobel(int x, int y)
 {
 float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +
 tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) +

@@ -76,11 +76,11 @@ namespace cv { namespace gpu { namespace bp
 template <int cn> struct PixDiff;
 template <> struct PixDiff<1>
 {
-__device__ PixDiff(const uchar* ls)
+__device__ __forceinline__ PixDiff(const uchar* ls)
 {
 l = *ls;
 }
-__device__ float operator()(const uchar* rs) const
+__device__ __forceinline__ float operator()(const uchar* rs) const
 {
 return abs((int)l - *rs);
 }
@@ -88,11 +88,11 @@ namespace cv { namespace gpu { namespace bp
 };
 template <> struct PixDiff<3>
 {
-__device__ PixDiff(const uchar* ls)
+__device__ __forceinline__ PixDiff(const uchar* ls)
 {
 l = *((uchar3*)ls);
 }
-__device__ float operator()(const uchar* rs) const
+__device__ __forceinline__ float operator()(const uchar* rs) const
 {
 const float tr = 0.299f;
 const float tg = 0.587f;
@@ -108,11 +108,11 @@ namespace cv { namespace gpu { namespace bp
 };
 template <> struct PixDiff<4>
 {
-__device__ PixDiff(const uchar* ls)
+__device__ __forceinline__ PixDiff(const uchar* ls)
 {
 l = *((uchar4*)ls);
 }
-__device__ float operator()(const uchar* rs) const
+__device__ __forceinline__ float operator()(const uchar* rs) const
 {
 const float tr = 0.299f;
 const float tg = 0.587f;

@@ -102,14 +102,14 @@ namespace cv { namespace gpu { namespace csbp
 template <int channels> struct DataCostPerPixel;
 template <> struct DataCostPerPixel<1>
 {
-static __device__ float compute(const uchar* left, const uchar* right)
+static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
 {
 return fmin(cdata_weight * abs((int)*left - *right), cdata_weight * cmax_data_term);
 }
 };
 template <> struct DataCostPerPixel<3>
 {
-static __device__ float compute(const uchar* left, const uchar* right)
+static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
 {
 float tb = 0.114f * abs((int)left[0] - right[0]);
 float tg = 0.587f * abs((int)left[1] - right[1]);
@@ -120,7 +120,7 @@ namespace cv { namespace gpu { namespace csbp
 };
 template <> struct DataCostPerPixel<4>
 {
-static __device__ float compute(const uchar* left, const uchar* right)
+static __device__ __forceinline__ float compute(const uchar* left, const uchar* right)
 {
 uchar4 l = *((const uchar4*)left);
 uchar4 r = *((const uchar4*)right);

@@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace surf
 __constant__ float c_DY [3][5] = { {2, 0, 7, 3, 1}, {2, 3, 7, 6, -2}, {2, 6, 7, 9, 1} };
 __constant__ float c_DXY[4][5] = { {1, 1, 4, 4, 1}, {5, 1, 8, 4, -1}, {1, 5, 4, 8, -1}, {5, 5, 8, 8, 1} };

-__host__ __device__ int calcSize(int octave, int layer)
+__host__ __device__ __forceinline__ int calcSize(int octave, int layer)
 {
 /* Wavelet size at first layer of first octave. */
 const int HAAR_SIZE0 = 9;
@@ -189,7 +189,7 @@ namespace cv { namespace gpu { namespace surf

 struct WithOutMask
 {
-static __device__ bool check(int, int, int)
+static __device__ __forceinline__ bool check(int, int, int)
 {
 return true;
 }
@@ -708,7 +708,7 @@ namespace cv { namespace gpu { namespace surf
 3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
 };

-__device__ unsigned char calcWin(int i, int j, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir)
+__device__ __forceinline__ unsigned char calcWin(int i, int j, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir)
 {
 float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
 float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;

@ -40,208 +40,207 @@
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
|
||||
#define __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
|
||||
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
#include "opencv2/gpu/device/vecmath.hpp"
|
||||
|
||||
namespace cv
|
||||
{
|
||||
namespace gpu
|
||||
namespace cv { namespace gpu { namespace device
|
||||
{
|
||||
struct BrdReflect101
|
||||
{
|
||||
namespace device
|
||||
explicit BrdReflect101(int len): last(len - 1) {}
|
||||
|
||||
__device__ __forceinline__ int idx_low(int i) const
|
||||
{
|
||||
struct BrdReflect101
|
||||
{
|
||||
explicit BrdReflect101(int len): last(len - 1) {}
|
||||
|
||||
__device__ int idx_low(int i) const
|
||||
{
|
||||
return abs(i);
|
||||
}
|
||||
|
||||
__device__ int idx_high(int i) const
|
||||
{
|
||||
return last - abs(last - i);
|
||||
}
|
||||
|
||||
__device__ int idx(int i) const
|
||||
{
|
||||
return idx_low(idx_high(i));
|
||||
}
|
||||
|
||||
bool is_range_safe(int mini, int maxi) const
|
||||
{
|
||||
return -last <= mini && maxi <= 2 * last;
|
||||
}
|
||||
|
||||
private:
|
||||
int last;
|
||||
};
|
||||
|
||||
|
||||
template <typename D>
|
||||
struct BrdRowReflect101: BrdReflect101
|
||||
{
|
||||
explicit BrdRowReflect101(int len): BrdReflect101(len) {}
|
||||
|
||||
template <typename T>
|
||||
__device__ D at_low(int i, const T* data) const
|
||||
{
|
||||
return saturate_cast<D>(data[idx_low(i)]);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ D at_high(int i, const T* data) const
{
    return saturate_cast<D>(data[idx_high(i)]);
}
};


template <typename D>
struct BrdColReflect101: BrdReflect101
{
    BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}

    template <typename T>
    __device__ D at_low(int i, const T* data) const
    {
        return saturate_cast<D>(data[idx_low(i) * step]);
    }

    template <typename T>
    __device__ D at_high(int i, const T* data) const
    {
        return saturate_cast<D>(data[idx_high(i) * step]);
    }

private:
    int step;
};


struct BrdReplicate
{
    explicit BrdReplicate(int len): last(len - 1) {}

    __device__ int idx_low(int i) const
    {
        return max(i, 0);
    }

    __device__ int idx_high(int i) const
    {
        return min(i, last);
    }

    __device__ int idx(int i) const
    {
        return idx_low(idx_high(i));
    }

    bool is_range_safe(int mini, int maxi) const
    {
        return true;
    }

private:
    int last;
};


template <typename D>
struct BrdRowReplicate: BrdReplicate
{
    explicit BrdRowReplicate(int len): BrdReplicate(len) {}

    template <typename T>
    __device__ D at_low(int i, const T* data) const
    {
        return saturate_cast<D>(data[idx_low(i)]);
    }

    template <typename T>
    __device__ D at_high(int i, const T* data) const
    {
        return saturate_cast<D>(data[idx_high(i)]);
    }
};


template <typename D>
struct BrdColReplicate: BrdReplicate
{
    BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}

    template <typename T>
    __device__ D at_low(int i, const T* data) const
    {
        return saturate_cast<D>(data[idx_low(i) * step]);
    }

    template <typename T>
    __device__ D at_high(int i, const T* data) const
    {
        return saturate_cast<D>(data[idx_high(i) * step]);
    }

private:
    int step;
};

template <typename D>
struct BrdRowConstant
{
    explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}

    template <typename T>
    __device__ D at_low(int i, const T* data) const
    {
        return i >= 0 ? saturate_cast<D>(data[i]) : val;
    }

    template <typename T>
    __device__ D at_high(int i, const T* data) const
    {
        return i < len ? saturate_cast<D>(data[i]) : val;
    }

    bool is_range_safe(int mini, int maxi) const
    {
        return true;
    }

private:
    int len;
    D val;
};

template <typename D>
struct BrdColConstant
{
    BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}

    template <typename T>
    __device__ D at_low(int i, const T* data) const
    {
        return i >= 0 ? saturate_cast<D>(data[i * step]) : val;
    }

    template <typename T>
    __device__ D at_high(int i, const T* data) const
    {
        return i < len ? saturate_cast<D>(data[i * step]) : val;
    }

    bool is_range_safe(int mini, int maxi) const
    {
        return true;
    }

private:
    int len;
    int step;
    D val;
};
struct BrdReflect101
{
    explicit BrdReflect101(int len): last(len - 1) {}

    __device__ __forceinline__ int idx_low(int i) const
    {
        return abs(i);
    }
    __device__ __forceinline__ int idx_high(int i) const
    {
        return last - abs(last - i);
    }

    __device__ __forceinline__ int idx(int i) const
    {
        return idx_low(idx_high(i));
    }

    bool is_range_safe(int mini, int maxi) const
    {
        return -last <= mini && maxi <= 2 * last;
    }

private:
    int last;
};


template <typename D>
struct BrdRowReflect101: BrdReflect101
{
    explicit BrdRowReflect101(int len): BrdReflect101(len) {}

    template <typename T>
    __device__ __forceinline__ D at_low(int i, const T* data) const
    {
        return saturate_cast<D>(data[idx_low(i)]);
    }

    template <typename T>
    __device__ __forceinline__ D at_high(int i, const T* data) const
    {
        return saturate_cast<D>(data[idx_high(i)]);
    }
};


template <typename D>
struct BrdColReflect101: BrdReflect101
{
    BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}

    template <typename T>
    __device__ __forceinline__ D at_low(int i, const T* data) const
    {
        return saturate_cast<D>(data[idx_low(i) * step]);
    }

    template <typename T>
    __device__ __forceinline__ D at_high(int i, const T* data) const
    {
        return saturate_cast<D>(data[idx_high(i) * step]);
    }

private:
    int step;
};


struct BrdReplicate
{
    explicit BrdReplicate(int len): last(len - 1) {}

    __device__ __forceinline__ int idx_low(int i) const
    {
        return max(i, 0);
    }

    __device__ __forceinline__ int idx_high(int i) const
    {
        return min(i, last);
    }

    __device__ __forceinline__ int idx(int i) const
    {
        return idx_low(idx_high(i));
    }

    bool is_range_safe(int mini, int maxi) const
    {
        return true;
    }

private:
    int last;
};


template <typename D>
struct BrdRowReplicate: BrdReplicate
{
    explicit BrdRowReplicate(int len): BrdReplicate(len) {}

    template <typename T>
    __device__ __forceinline__ D at_low(int i, const T* data) const
    {
        return saturate_cast<D>(data[idx_low(i)]);
    }

    template <typename T>
    __device__ __forceinline__ D at_high(int i, const T* data) const
    {
        return saturate_cast<D>(data[idx_high(i)]);
    }
};


template <typename D>
struct BrdColReplicate: BrdReplicate
{
    BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}

    template <typename T>
    __device__ __forceinline__ D at_low(int i, const T* data) const
    {
        return saturate_cast<D>(data[idx_low(i) * step]);
    }

    template <typename T>
    __device__ __forceinline__ D at_high(int i, const T* data) const
    {
        return saturate_cast<D>(data[idx_high(i) * step]);
    }

private:
    int step;
};

template <typename D>
struct BrdRowConstant
{
    explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}

    template <typename T>
    __device__ __forceinline__ D at_low(int i, const T* data) const
    {
        return i >= 0 ? saturate_cast<D>(data[i]) : val;
    }

    template <typename T>
    __device__ __forceinline__ D at_high(int i, const T* data) const
    {
        return i < len ? saturate_cast<D>(data[i]) : val;
    }

    bool is_range_safe(int mini, int maxi) const
    {
        return true;
    }

private:
    int len;
    D val;
};

template <typename D>
struct BrdColConstant
{
    BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}

    template <typename T>
    __device__ __forceinline__ D at_low(int i, const T* data) const
    {
        return i >= 0 ? saturate_cast<D>(data[i * step]) : val;
    }

    template <typename T>
    __device__ __forceinline__ D at_high(int i, const T* data) const
    {
        return i < len ? saturate_cast<D>(data[i * step]) : val;
    }

    bool is_range_safe(int mini, int maxi) const
    {
        return true;
    }

private:
    int len;
    int step;
    D val;
};
}}}

#endif // __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
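All of the border wrappers above expose the same small interface (idx_low/idx_high, at_low/at_high), so a filter kernel can be parameterized on the border mode. A minimal, hypothetical sketch (not part of this commit) of how BrdReflect101 makes out-of-range reads safe; the kernel name and launch shape are assumptions for illustration:

    // Hypothetical illustration: brd.idx() folds any column index in
    // [-last, 2 * last] back into [0, len), so negative x is safe to read.
    // dst must hold len + 8 elements; brd is constructed on the host as
    // BrdReflect101(len) and passed by value (the struct is trivially copyable).
    template <typename D, typename T>
    __global__ void borderedCopy(const T* src, D* dst, int len, BrdReflect101 brd)
    {
        int x = (int)(blockIdx.x * blockDim.x + threadIdx.x) - 8; // first 8 threads read "left of" the row
        if (x < len)
            dst[x + 8] = saturate_cast<D>(src[brd.idx(x)]);
    }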
@ -1,39 +1,105 @@

/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_DATAMOV_UTILS_HPP__
#define __OPENCV_GPU_DATAMOV_UTILS_HPP__

#include "internal_shared.hpp"

namespace cv { namespace gpu { namespace device
{
    #if __CUDA_ARCH__ >= 200

        // for Fermi memory space is detected automatically
        template <typename T> struct ForceGlobLoad
        {
            __device__ __forceinline__ static void Ld(T* ptr, int offset, T& val) { val = ptr[offset]; }
        };

        // for Fermi memory space is detected automatically
        template <typename T> struct ForceGlob
        {
            __device__ __forceinline__ static void Load(const T* ptr, int offset, T& val) { val = ptr[offset]; }
        };

    #else // __CUDA_ARCH__ >= 200

        #if defined(_WIN64) || defined(__LP64__)
            // 64-bit register modifier for inlined asm
            #define _OPENCV_ASM_PTR_ "l"
        #else
            // 32-bit register modifier for inlined asm
            #define _OPENCV_ASM_PTR_ "r"
        #endif

        template<class T> struct ForceGlob;
        template<class T> struct ForceGlobLoad;

        #define DEFINE_FORCE_GLOB(base_type, ptx_type, reg_mod) \
        template <> struct ForceGlob<base_type> \
        { \
            __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
            { \
                asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : _OPENCV_ASM_PTR_(ptr + offset)); \
            } \
        };

        #define DEFINE_FORCE_GLOB_B(base_type, ptx_type) \
        template <> struct ForceGlob<base_type> \
        { \
            __device__ __forceinline__ static void Load(const base_type* ptr, int offset, base_type& val) \
            { \
                asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : _OPENCV_ASM_PTR_(ptr + offset)); \
            } \
        };

        DEFINE_FORCE_GLOB_B(uchar,  u8)
        DEFINE_FORCE_GLOB_B(schar,  s8)
        DEFINE_FORCE_GLOB_B(char,   b8)
        DEFINE_FORCE_GLOB (ushort, u16, h)
        DEFINE_FORCE_GLOB (short,  s16, h)
        DEFINE_FORCE_GLOB (uint,   u32, r)
        DEFINE_FORCE_GLOB (int,    s32, r)
        DEFINE_FORCE_GLOB (float,  f32, f)
        DEFINE_FORCE_GLOB (double, f64, d)

        #define DEFINE_FORCE_GLOB_LOAD(base_type, ptx_type, reg_mod) \
        template <> struct ForceGlobLoad<base_type> \
        { \
            __device__ __forceinline__ static void Ld(base_type* ptr, int offset, base_type& val) \
            { \
                asm("ld.global."#ptx_type" %0, [%1];" : "="#reg_mod(val) : _OPENCV_ASM_PTR_(ptr + offset)); \
            } \
        };

        DEFINE_FORCE_GLOB_LOAD(int, s32, r)
        DEFINE_FORCE_GLOB_LOAD(float, f32, f)

        #undef DEFINE_FORCE_GLOB
        #undef DEFINE_FORCE_GLOB_B
        #undef DEFINE_FORCE_GLOB_LOAD
        #undef _OPENCV_ASM_PTR_

    #endif // __CUDA_ARCH__ >= 200
}}}

#endif // __OPENCV_GPU_DATAMOV_UTILS_HPP__
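On sm_1x the compiler cannot always tell which address space a generic pointer refers to (the "cannot tell what pointer points to" warning this commit fixes), so ForceGlob pins the read to global memory with inline PTX; on Fermi and later it degenerates to a plain indexed load. A hypothetical usage sketch (not from the commit; the function name is an assumption):

    // Hypothetical sketch: force two reads to be global-memory loads even when
    // the compiler cannot prove the pointer's address space (pre-Fermi).
    __device__ float sumPair(const float* row, int i)
    {
        float a, b;
        ForceGlob<float>::Load(row, i,     a); // compiles to ld.global.f32 on sm_1x
        ForceGlob<float>::Load(row, i + 1, b);
        return a + b;
    }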

@ -40,44 +40,41 @@
//
//M*/
#ifndef __OPENCV_GPU_DYNAMIC_SMEM_HPP__
#define __OPENCV_GPU_DYNAMIC_SMEM_HPP__

namespace cv
{
    namespace gpu
    {
        namespace device
        {
            template<class T> struct DynamicSharedMem
            {
                __device__ operator T*()
                {
                    extern __shared__ int __smem[];
                    return (T*)__smem;
                }

                __device__ operator const T*() const
                {
                    extern __shared__ int __smem[];
                    return (T*)__smem;
                }
            };

            // specialize for double to avoid unaligned memory access compile errors
            template<> struct DynamicSharedMem<double>
            {
                __device__ operator double*()
                {
                    extern __shared__ double __smem_d[];
                    return (double*)__smem_d;
                }

                __device__ operator const double*() const
                {
                    extern __shared__ double __smem_d[];
                    return (double*)__smem_d;
                }
            };
        }
    }
}

namespace cv { namespace gpu { namespace device
{
    template<class T> struct DynamicSharedMem
    {
        __device__ __forceinline__ operator T*()
        {
            extern __shared__ int __smem[];
            return (T*)__smem;
        }

        __device__ __forceinline__ operator const T*() const
        {
            extern __shared__ int __smem[];
            return (T*)__smem;
        }
    };

    // specialize for double to avoid unaligned memory access compile errors
    template<> struct DynamicSharedMem<double>
    {
        __device__ __forceinline__ operator double*()
        {
            extern __shared__ double __smem_d[];
            return (double*)__smem_d;
        }

        __device__ __forceinline__ operator const double*() const
        {
            extern __shared__ double __smem_d[];
            return (double*)__smem_d;
        }
    };
}}}

#endif // __OPENCV_GPU_DYNAMIC_SMEM_HPP__
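DynamicSharedMem<T> exists because every `extern __shared__` declaration in a translation unit must name the same array; the struct centralizes that declaration and hands back a typed pointer, which is why double gets its own specialization. A hypothetical usage sketch (not part of this commit; kernel name and sizes are assumptions):

    // Hypothetical sketch: a block-local reverse using dynamically sized shared
    // memory; the byte count comes from the third launch parameter.
    template <typename T>
    __global__ void reverseBlock(T* data)
    {
        DynamicSharedMem<T> smem;
        T* s = smem; // operator T*() returns the extern __shared__ buffer
        s[threadIdx.x] = data[threadIdx.x];
        __syncthreads();
        data[threadIdx.x] = s[blockDim.x - 1 - threadIdx.x];
    }

    // launch: reverseBlock<float><<<1, 256, 256 * sizeof(float)>>>(d_data);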

@ -40,181 +40,179 @@
//
//M*/

#ifndef __OPENCV_GPU_LIMITS_GPU_HPP__
#define __OPENCV_GPU_LIMITS_GPU_HPP__
namespace cv
{
    namespace gpu
    {
        namespace device
        {
            template<class T> struct numeric_limits_gpu
            {
                typedef T type;
                __device__ static type min() { return type(); };
                __device__ static type max() { return type(); };
                __device__ static type epsilon() { return type(); }
                __device__ static type round_error() { return type(); }
                __device__ static type denorm_min() { return type(); }
                __device__ static type infinity() { return type(); }
                __device__ static type quiet_NaN() { return type(); }
                __device__ static type signaling_NaN() { return T(); }
                static const bool is_signed;
            };
namespace cv { namespace gpu { namespace device
{
    template<class T> struct numeric_limits_gpu
    {
        typedef T type;
        __device__ __forceinline__ static type min() { return type(); };
        __device__ __forceinline__ static type max() { return type(); };
        __device__ __forceinline__ static type epsilon() { return type(); }
        __device__ __forceinline__ static type round_error() { return type(); }
        __device__ __forceinline__ static type denorm_min() { return type(); }
        __device__ __forceinline__ static type infinity() { return type(); }
        __device__ __forceinline__ static type quiet_NaN() { return type(); }
        __device__ __forceinline__ static type signaling_NaN() { return T(); }
        static const bool is_signed;
    };

            template<> struct numeric_limits_gpu<bool>
            {
                typedef bool type;
                __device__ static type min() { return false; };
                __device__ static type max() { return true; };
                __device__ static type epsilon();
                __device__ static type round_error();
                __device__ static type denorm_min();
                __device__ static type infinity();
                __device__ static type quiet_NaN();
                __device__ static type signaling_NaN();
                static const bool is_signed = false;
            };
    template<> struct numeric_limits_gpu<bool>
    {
        typedef bool type;
        __device__ __forceinline__ static type min() { return false; };
        __device__ __forceinline__ static type max() { return true; };
        __device__ __forceinline__ static type epsilon();
        __device__ __forceinline__ static type round_error();
        __device__ __forceinline__ static type denorm_min();
        __device__ __forceinline__ static type infinity();
        __device__ __forceinline__ static type quiet_NaN();
        __device__ __forceinline__ static type signaling_NaN();
        static const bool is_signed = false;
    };

            template<> struct numeric_limits_gpu<char>
            {
                typedef char type;
                __device__ static type min() { return CHAR_MIN; };
                __device__ static type max() { return CHAR_MAX; };
                __device__ static type epsilon();
                __device__ static type round_error();
                __device__ static type denorm_min();
                __device__ static type infinity();
                __device__ static type quiet_NaN();
                __device__ static type signaling_NaN();
                static const bool is_signed = (char)-1 == -1;
            };
    template<> struct numeric_limits_gpu<char>
    {
        typedef char type;
        __device__ __forceinline__ static type min() { return CHAR_MIN; };
        __device__ __forceinline__ static type max() { return CHAR_MAX; };
        __device__ __forceinline__ static type epsilon();
        __device__ __forceinline__ static type round_error();
        __device__ __forceinline__ static type denorm_min();
        __device__ __forceinline__ static type infinity();
        __device__ __forceinline__ static type quiet_NaN();
        __device__ __forceinline__ static type signaling_NaN();
        static const bool is_signed = (char)-1 == -1;
    };

            template<> struct numeric_limits_gpu<unsigned char>
            {
                typedef unsigned char type;
                __device__ static type min() { return 0; };
                __device__ static type max() { return UCHAR_MAX; };
                __device__ static type epsilon();
                __device__ static type round_error();
                __device__ static type denorm_min();
                __device__ static type infinity();
                __device__ static type quiet_NaN();
                __device__ static type signaling_NaN();
                static const bool is_signed = false;
            };
    template<> struct numeric_limits_gpu<unsigned char>
    {
        typedef unsigned char type;
        __device__ __forceinline__ static type min() { return 0; };
        __device__ __forceinline__ static type max() { return UCHAR_MAX; };
        __device__ __forceinline__ static type epsilon();
        __device__ __forceinline__ static type round_error();
        __device__ __forceinline__ static type denorm_min();
        __device__ __forceinline__ static type infinity();
        __device__ __forceinline__ static type quiet_NaN();
        __device__ __forceinline__ static type signaling_NaN();
        static const bool is_signed = false;
    };

            template<> struct numeric_limits_gpu<short>
            {
                typedef short type;
                __device__ static type min() { return SHRT_MIN; };
                __device__ static type max() { return SHRT_MAX; };
                __device__ static type epsilon();
                __device__ static type round_error();
                __device__ static type denorm_min();
                __device__ static type infinity();
                __device__ static type quiet_NaN();
                __device__ static type signaling_NaN();
                static const bool is_signed = true;
            };
    template<> struct numeric_limits_gpu<short>
    {
        typedef short type;
        __device__ __forceinline__ static type min() { return SHRT_MIN; };
        __device__ __forceinline__ static type max() { return SHRT_MAX; };
        __device__ __forceinline__ static type epsilon();
        __device__ __forceinline__ static type round_error();
        __device__ __forceinline__ static type denorm_min();
        __device__ __forceinline__ static type infinity();
        __device__ __forceinline__ static type quiet_NaN();
        __device__ __forceinline__ static type signaling_NaN();
        static const bool is_signed = true;
    };

            template<> struct numeric_limits_gpu<unsigned short>
            {
                typedef unsigned short type;
                __device__ static type min() { return 0; };
                __device__ static type max() { return USHRT_MAX; };
                __device__ static type epsilon();
                __device__ static type round_error();
                __device__ static type denorm_min();
                __device__ static type infinity();
                __device__ static type quiet_NaN();
                __device__ static type signaling_NaN();
                static const bool is_signed = false;
            };
    template<> struct numeric_limits_gpu<unsigned short>
    {
        typedef unsigned short type;
        __device__ __forceinline__ static type min() { return 0; };
        __device__ __forceinline__ static type max() { return USHRT_MAX; };
        __device__ __forceinline__ static type epsilon();
        __device__ __forceinline__ static type round_error();
        __device__ __forceinline__ static type denorm_min();
        __device__ __forceinline__ static type infinity();
        __device__ __forceinline__ static type quiet_NaN();
        __device__ __forceinline__ static type signaling_NaN();
        static const bool is_signed = false;
    };

            template<> struct numeric_limits_gpu<int>
            {
                typedef int type;
                __device__ static type min() { return INT_MIN; };
                __device__ static type max() { return INT_MAX; };
                __device__ static type epsilon();
                __device__ static type round_error();
                __device__ static type denorm_min();
                __device__ static type infinity();
                __device__ static type quiet_NaN();
                __device__ static type signaling_NaN();
                static const bool is_signed = true;
            };
    template<> struct numeric_limits_gpu<int>
    {
        typedef int type;
        __device__ __forceinline__ static type min() { return INT_MIN; };
        __device__ __forceinline__ static type max() { return INT_MAX; };
        __device__ __forceinline__ static type epsilon();
        __device__ __forceinline__ static type round_error();
        __device__ __forceinline__ static type denorm_min();
        __device__ __forceinline__ static type infinity();
        __device__ __forceinline__ static type quiet_NaN();
        __device__ __forceinline__ static type signaling_NaN();
        static const bool is_signed = true;
    };

            template<> struct numeric_limits_gpu<unsigned int>
            {
                typedef unsigned int type;
                __device__ static type min() { return 0; };
                __device__ static type max() { return UINT_MAX; };
                __device__ static type epsilon();
                __device__ static type round_error();
                __device__ static type denorm_min();
                __device__ static type infinity();
                __device__ static type quiet_NaN();
                __device__ static type signaling_NaN();
                static const bool is_signed = false;
            };
    template<> struct numeric_limits_gpu<unsigned int>
    {
        typedef unsigned int type;
        __device__ __forceinline__ static type min() { return 0; };
        __device__ __forceinline__ static type max() { return UINT_MAX; };
        __device__ __forceinline__ static type epsilon();
        __device__ __forceinline__ static type round_error();
        __device__ __forceinline__ static type denorm_min();
        __device__ __forceinline__ static type infinity();
        __device__ __forceinline__ static type quiet_NaN();
        __device__ __forceinline__ static type signaling_NaN();
        static const bool is_signed = false;
    };

            template<> struct numeric_limits_gpu<long>
            {
                typedef long type;
                __device__ static type min() { return LONG_MIN; };
                __device__ static type max() { return LONG_MAX; };
                __device__ static type epsilon();
                __device__ static type round_error();
                __device__ static type denorm_min();
                __device__ static type infinity();
                __device__ static type quiet_NaN();
                __device__ static type signaling_NaN();
                static const bool is_signed = true;
            };
    template<> struct numeric_limits_gpu<long>
    {
        typedef long type;
        __device__ __forceinline__ static type min() { return LONG_MIN; };
        __device__ __forceinline__ static type max() { return LONG_MAX; };
        __device__ __forceinline__ static type epsilon();
        __device__ __forceinline__ static type round_error();
        __device__ __forceinline__ static type denorm_min();
        __device__ __forceinline__ static type infinity();
        __device__ __forceinline__ static type quiet_NaN();
        __device__ __forceinline__ static type signaling_NaN();
        static const bool is_signed = true;
    };

            template<> struct numeric_limits_gpu<unsigned long>
            {
                typedef unsigned long type;
                __device__ static type min() { return 0; };
                __device__ static type max() { return ULONG_MAX; };
                __device__ static type epsilon();
                __device__ static type round_error();
                __device__ static type denorm_min();
                __device__ static type infinity();
                __device__ static type quiet_NaN();
                __device__ static type signaling_NaN();
                static const bool is_signed = false;
            };

            template<> struct numeric_limits_gpu<float>
            {
                typedef float type;
                __device__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };
                __device__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; };
                __device__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; };
                __device__ static type round_error();
                __device__ static type denorm_min();
                __device__ static type infinity();
                __device__ static type quiet_NaN();
                __device__ static type signaling_NaN();
                static const bool is_signed = true;
            };
    template<> struct numeric_limits_gpu<unsigned long>
    {
        typedef unsigned long type;
        __device__ __forceinline__ static type min() { return 0; };
        __device__ __forceinline__ static type max() { return ULONG_MAX; };
        __device__ __forceinline__ static type epsilon();
        __device__ __forceinline__ static type round_error();
        __device__ __forceinline__ static type denorm_min();
        __device__ __forceinline__ static type infinity();
        __device__ __forceinline__ static type quiet_NaN();
        __device__ __forceinline__ static type signaling_NaN();
        static const bool is_signed = false;
    };

    template<> struct numeric_limits_gpu<float>
    {
        typedef float type;
        __device__ __forceinline__ static type min() { return 1.175494351e-38f/*FLT_MIN*/; };
        __device__ __forceinline__ static type max() { return 3.402823466e+38f/*FLT_MAX*/; };
        __device__ __forceinline__ static type epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; };
        __device__ __forceinline__ static type round_error();
        __device__ __forceinline__ static type denorm_min();
        __device__ __forceinline__ static type infinity();
        __device__ __forceinline__ static type quiet_NaN();
        __device__ __forceinline__ static type signaling_NaN();
        static const bool is_signed = true;
    };

            template<> struct numeric_limits_gpu<double>
            {
                typedef double type;
                __device__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; };
                __device__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; };
                __device__ static type epsilon();
                __device__ static type round_error();
                __device__ static type denorm_min();
                __device__ static type infinity();
                __device__ static type quiet_NaN();
                __device__ static type signaling_NaN();
                static const bool is_signed = true;
            };
        }
    }
}
    template<> struct numeric_limits_gpu<double>
    {
        typedef double type;
        __device__ __forceinline__ static type min() { return 2.2250738585072014e-308/*DBL_MIN*/; };
        __device__ __forceinline__ static type max() { return 1.7976931348623158e+308/*DBL_MAX*/; };
        __device__ __forceinline__ static type epsilon();
        __device__ __forceinline__ static type round_error();
        __device__ __forceinline__ static type denorm_min();
        __device__ __forceinline__ static type infinity();
        __device__ __forceinline__ static type quiet_NaN();
        __device__ __forceinline__ static type signaling_NaN();
        static const bool is_signed = true;
    };
}}}

#endif // __OPENCV_GPU_LIMITS_GPU_HPP__
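numeric_limits_gpu mirrors std::numeric_limits for device code, where the standard header is unavailable. A hypothetical sketch (not from the commit; function name assumed) of the usual pattern, seeding a reduction with the largest representable value:

    // Hypothetical sketch: initialize a running minimum from the type's limits.
    __device__ float rowMin(const float* row, int len)
    {
        float m = numeric_limits_gpu<float>::max();
        for (int i = threadIdx.x; i < len; i += blockDim.x)
            m = fminf(m, row[i]);
        return m; // still needs a block-level reduction to combine lanes
    }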
@ -51,29 +51,29 @@ namespace cv
{
    namespace device
    {
        template<typename _Tp> static __device__ _Tp saturate_cast(uchar v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(schar v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(ushort v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(short v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(uint v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(int v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(float v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(double v) { return _Tp(v); }
        template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(uchar v) { return _Tp(v); }
        template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(schar v) { return _Tp(v); }
        template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(ushort v) { return _Tp(v); }
        template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(short v) { return _Tp(v); }
        template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(uint v) { return _Tp(v); }
        template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(int v) { return _Tp(v); }
        template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(float v) { return _Tp(v); }
        template<typename _Tp> static __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }

        template<> static __device__ uchar saturate_cast<uchar>(schar v)
        template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
        { return (uchar)max((int)v, 0); }
        template<> static __device__ uchar saturate_cast<uchar>(ushort v)
        template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
        { return (uchar)min((uint)v, (uint)UCHAR_MAX); }
        template<> static __device__ uchar saturate_cast<uchar>(int v)
        template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
        { return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
        template<> static __device__ uchar saturate_cast<uchar>(uint v)
        template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
        { return (uchar)min(v, (uint)UCHAR_MAX); }
        template<> static __device__ uchar saturate_cast<uchar>(short v)
        template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
        { return saturate_cast<uchar>((uint)v); }

        template<> static __device__ uchar saturate_cast<uchar>(float v)
        template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
        { int iv = __float2int_rn(v); return saturate_cast<uchar>(iv); }
        template<> static __device__ uchar saturate_cast<uchar>(double v)
        template<> static __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
        {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
            int iv = __double2int_rn(v); return saturate_cast<uchar>(iv);
@ -82,23 +82,23 @@ namespace cv
        #endif
        }
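The specializations above clamp instead of truncating, matching the host-side OpenCV semantics. A hypothetical illustration (not part of the commit) of the resulting behaviour:

        // Hypothetical illustration of the clamping behaviour:
        __device__ void saturateExamples()
        {
            uchar a = saturate_cast<uchar>(300);   // 255: clamped to UCHAR_MAX
            uchar b = saturate_cast<uchar>(-5);    // 0:   negative values clamp to 0
            uchar c = saturate_cast<uchar>(73.4f); // 73:  rounded with __float2int_rn, then clamped
            (void)a; (void)b; (void)c;
        }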

        template<> static __device__ schar saturate_cast<schar>(uchar v)
        template<> static __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
        { return (schar)min((int)v, SCHAR_MAX); }
        template<> static __device__ schar saturate_cast<schar>(ushort v)
        template<> static __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
        { return (schar)min((uint)v, (uint)SCHAR_MAX); }
        template<> static __device__ schar saturate_cast<schar>(int v)
        template<> static __device__ __forceinline__ schar saturate_cast<schar>(int v)
        {
            return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ?
                           v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
        }
        template<> static __device__ schar saturate_cast<schar>(short v)
        template<> static __device__ __forceinline__ schar saturate_cast<schar>(short v)
        { return saturate_cast<schar>((int)v); }
        template<> static __device__ schar saturate_cast<schar>(uint v)
        template<> static __device__ __forceinline__ schar saturate_cast<schar>(uint v)
        { return (schar)min(v, (uint)SCHAR_MAX); }

        template<> static __device__ schar saturate_cast<schar>(float v)
        template<> static __device__ __forceinline__ schar saturate_cast<schar>(float v)
        { int iv = __float2int_rn(v); return saturate_cast<schar>(iv); }
        template<> static __device__ schar saturate_cast<schar>(double v)
        template<> static __device__ __forceinline__ schar saturate_cast<schar>(double v)
        {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
            int iv = __double2int_rn(v); return saturate_cast<schar>(iv);
@ -107,17 +107,17 @@ namespace cv
        #endif
        }

        template<> static __device__ ushort saturate_cast<ushort>(schar v)
        template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
        { return (ushort)max((int)v, 0); }
        template<> static __device__ ushort saturate_cast<ushort>(short v)
        template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
        { return (ushort)max((int)v, 0); }
        template<> static __device__ ushort saturate_cast<ushort>(int v)
        template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
        { return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
        template<> static __device__ ushort saturate_cast<ushort>(uint v)
        template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
        { return (ushort)min(v, (uint)USHRT_MAX); }
        template<> static __device__ ushort saturate_cast<ushort>(float v)
        template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
        { int iv = __float2int_rn(v); return saturate_cast<ushort>(iv); }
        template<> static __device__ ushort saturate_cast<ushort>(double v)
        template<> static __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
        {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
            int iv = __double2int_rn(v); return saturate_cast<ushort>(iv);
@ -126,18 +126,18 @@ namespace cv
        #endif
        }

        template<> static __device__ short saturate_cast<short>(ushort v)
        template<> static __device__ __forceinline__ short saturate_cast<short>(ushort v)
        { return (short)min((int)v, SHRT_MAX); }
        template<> static __device__ short saturate_cast<short>(int v)
        template<> static __device__ __forceinline__ short saturate_cast<short>(int v)
        {
            return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ?
                           v : v > 0 ? SHRT_MAX : SHRT_MIN);
        }
        template<> static __device__ short saturate_cast<short>(uint v)
        template<> static __device__ __forceinline__ short saturate_cast<short>(uint v)
        { return (short)min(v, (uint)SHRT_MAX); }
        template<> static __device__ short saturate_cast<short>(float v)
        template<> static __device__ __forceinline__ short saturate_cast<short>(float v)
        { int iv = __float2int_rn(v); return saturate_cast<short>(iv); }
        template<> static __device__ short saturate_cast<short>(double v)
        template<> static __device__ __forceinline__ short saturate_cast<short>(double v)
        {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
            int iv = __double2int_rn(v); return saturate_cast<short>(iv);
@ -146,8 +146,8 @@ namespace cv
        #endif
        }

        template<> static __device__ int saturate_cast<int>(float v) { return __float2int_rn(v); }
        template<> static __device__ int saturate_cast<int>(double v)
        template<> static __device__ __forceinline__ int saturate_cast<int>(float v) { return __float2int_rn(v); }
        template<> static __device__ __forceinline__ int saturate_cast<int>(double v)
        {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
            return __double2int_rn(v);
@ -156,8 +156,8 @@ namespace cv
        #endif
        }

        template<> static __device__ uint saturate_cast<uint>(float v){ return __float2uint_rn(v); }
        template<> static __device__ uint saturate_cast<uint>(double v)
        template<> static __device__ __forceinline__ uint saturate_cast<uint>(float v){ return __float2uint_rn(v); }
        template<> static __device__ __forceinline__ uint saturate_cast<uint>(double v)
        {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
            return __double2uint_rn(v);
@ -55,7 +55,7 @@ namespace cv { namespace gpu { namespace device
    public:
        explicit MaskReader(const PtrStep& mask_): mask(mask_) {}

        __device__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; }
        __device__ __forceinline__ bool operator()(int y, int x) const { return mask.ptr(y)[x]; }

    private:
        PtrStep mask;
@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device

    struct NoMask
    {
        __device__ bool operator()(int y, int x) const { return true; }
        __device__ __forceinline__ bool operator()(int y, int x) const { return true; }
    };
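MaskReader and NoMask satisfy the same callable concept, so one kernel template covers both the masked and unmasked paths, and NoMask's constant true lets the compiler drop the branch entirely. A hypothetical sketch (not part of this commit; the kernel and its launches are assumptions):

    // Hypothetical sketch: the same kernel instantiated with either mask functor.
    template <typename T, typename Mask>
    __global__ void negate(const T* src, T* dst, int cols, Mask mask)
    {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y;
        if (x < cols && mask(y, x))
            dst[y * cols + x] = -src[y * cols + x];
    }

    // negate<<<grid, block>>>(src, dst, cols, NoMask());        // unmasked
    // negate<<<grid, block>>>(src, dst, cols, MaskReader(m));   // m is a PtrStep built on the host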

    //! Read Write Traits
@ -121,14 +121,14 @@ namespace cv { namespace gpu { namespace device
    template <> struct OpUnroller<1>
    {
        template <typename T, typename D, typename UnOp, typename Mask>
        static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src.x);
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
        static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src1.x, src2.x);
@ -137,7 +137,7 @@ namespace cv { namespace gpu { namespace device
    template <> struct OpUnroller<2>
    {
        template <typename T, typename D, typename UnOp, typename Mask>
        static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src.x);
@ -146,7 +146,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
        static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src1.x, src2.x);
@ -157,7 +157,7 @@ namespace cv { namespace gpu { namespace device
    template <> struct OpUnroller<3>
    {
        template <typename T, typename D, typename UnOp, typename Mask>
        static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src.x);
@ -168,7 +168,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
        static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src1.x, src2.x);
@ -181,7 +181,7 @@ namespace cv { namespace gpu { namespace device
    template <> struct OpUnroller<4>
    {
        template <typename T, typename D, typename UnOp, typename Mask>
        static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
        static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src.x);
@ -194,7 +194,7 @@ namespace cv { namespace gpu { namespace device
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
        static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
        static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src1.x, src2.x);
File diff suppressed because it is too large