diff --git a/modules/core/include/opencv2/core/cuda_devptrs.hpp b/modules/core/include/opencv2/core/cuda_devptrs.hpp
index 373ff290b..72a897b48 100644
--- a/modules/core/include/opencv2/core/cuda_devptrs.hpp
+++ b/modules/core/include/opencv2/core/cuda_devptrs.hpp
@@ -177,6 +177,20 @@ namespace cv
 //#undef __CV_GPU_DEPR_BEFORE__
 //#undef __CV_GPU_DEPR_AFTER__
 
+        namespace device
+        {
+            using cv::gpu::PtrSz;
+            using cv::gpu::PtrStep;
+            using cv::gpu::PtrStepSz;
+
+            using cv::gpu::PtrStepSzb;
+            using cv::gpu::PtrStepSzf;
+            using cv::gpu::PtrStepSzi;
+
+            using cv::gpu::PtrStepb;
+            using cv::gpu::PtrStepf;
+            using cv::gpu::PtrStepi;
+        }
     }
 }
 
diff --git a/modules/core/src/gpumat.cpp b/modules/core/src/gpumat.cpp
index d1f965a32..8c7a4708d 100644
--- a/modules/core/src/gpumat.cpp
+++ b/modules/core/src/gpumat.cpp
@@ -315,18 +315,6 @@ void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory)
 
 namespace
 {
-    template <class T> void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
-    {
-        *attribute = T();
-        //CUresult error = CUDA_SUCCESS;// = cuDeviceGetAttribute( attribute, device_attribute, device ); why link erros under ubuntu??
-        CUresult error = cuDeviceGetAttribute( attribute, device_attribute, device );
-        if( CUDA_SUCCESS == error )
-            return;
-
-        printf("Driver API error = %04d\n", error);
-        cv::gpu::error("driver API error", __FILE__, __LINE__);
-    }
-
     int convertSMVer2Cores(int major, int minor)
     {
         // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
@@ -335,7 +323,7 @@ namespace
             int Cores;
         } SMtoCores;
 
-        SMtoCores gpuArchCoresPerSM[] =  { { 0x10,  8 }, { 0x11,  8 }, { 0x12,  8 }, { 0x13,  8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, { -1, -1 }  };
+        SMtoCores gpuArchCoresPerSM[] =  { { 0x10,  8 }, { 0x11,  8 }, { 0x12,  8 }, { 0x13,  8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, { -1, -1 }  };
 
         int index = 0;
         while (gpuArchCoresPerSM[index].SM != -1)
@@ -344,7 +332,7 @@ namespace
                 return gpuArchCoresPerSM[index].Cores;
             index++;
         }
-        printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
+
         return -1;
     }
 }
@@ -382,22 +370,13 @@ void cv::gpu::printCudaDeviceInfo(int device)
         printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
         printf("  CUDA Capability Major/Minor version number:    %d.%d\n", prop.major, prop.minor);
         printf("  Total amount of global memory:                 %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem);
-        printf("  (%2d) Multiprocessors x (%2d) CUDA Cores/MP:     %d CUDA Cores\n",
-            prop.multiProcessorCount, convertSMVer2Cores(prop.major, prop.minor),
-            convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount);
+
+        int cores = convertSMVer2Cores(prop.major, prop.minor);
+        if (cores > 0)
+            printf("  (%2d) Multiprocessors x (%2d) CUDA Cores/MP:     %d CUDA Cores\n", prop.multiProcessorCount, cores, cores * prop.multiProcessorCount);
+
         printf("  GPU Clock Speed:                               %.2f GHz\n", prop.clockRate * 1e-6f);
 
-        // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output
-        int memoryClock, memBusWidth, L2CacheSize;
-        getCudaAttribute<int>( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev );
-        getCudaAttribute<int>( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev );
-        getCudaAttribute<int>( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev );
-
-        printf("  Memory Clock rate:                             %.2f Mhz\n", memoryClock * 1e-3f);
-        printf("  Memory Bus Width:                              %d-bit\n", memBusWidth);
-        if (L2CacheSize)
-            printf("  L2 Cache Size:                                 %d bytes\n", L2CacheSize);
-
         printf("  Max Texture Dimension Size (x,y,z)             1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n",
             prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1],
             prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]);
@@ -457,7 +436,12 @@ void cv::gpu::printShortCudaDeviceInfo(int device)
 
         const char *arch_str = prop.major < 2 ? " (not Fermi)" : "";
         printf("Device %d:  \"%s\"  %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f);
-        printf(", sm_%d%d%s, %d cores", prop.major, prop.minor, arch_str, convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount);
+        printf(", sm_%d%d%s", prop.major, prop.minor, arch_str);
+
+        int cores = convertSMVer2Cores(prop.major, prop.minor);
+        if (cores > 0)
+            printf(", %d cores", cores * prop.multiProcessorCount);
+
         printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100);
     }
     fflush(stdout);
diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt
index 580e7dfb9..cf69ba125 100644
--- a/modules/gpu/CMakeLists.txt
+++ b/modules/gpu/CMakeLists.txt
@@ -52,13 +52,11 @@ if (HAVE_CUDA)
   set(cuda_link_libs ${CUDA_LIBRARIES} ${CUDA_npp_LIBRARY})
 
   if(NOT APPLE)
-    unset(CUDA_nvcuvid_LIBRARY CACHE)
     find_cuda_helper_libs(nvcuvid)
     set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvid_LIBRARY})
   endif()
 
   if(WIN32)
-    unset(CUDA_nvcuvenc_LIBRARY CACHE)
     find_cuda_helper_libs(nvcuvenc)
     set(cuda_link_libs ${cuda_link_libs} ${CUDA_nvcuvenc_LIBRARY})
   endif()
diff --git a/modules/gpu/doc/matrix_reductions.rst b/modules/gpu/doc/matrix_reductions.rst
index ee2250a39..538267eb7 100644
--- a/modules/gpu/doc/matrix_reductions.rst
+++ b/modules/gpu/doc/matrix_reductions.rst
@@ -185,7 +185,7 @@ Reduces a matrix to a vector.
             * **CV_REDUCE_MIN** The output is the minimum (column/row-wise) of all rows/columns of the matrix.
 
     :param dtype: When it is negative, the destination vector will have the same type as the source matrix. Otherwise, its type will be  ``CV_MAKE_TYPE(CV_MAT_DEPTH(dtype), mtx.channels())`` .
-    
+
 The function ``reduce`` reduces the matrix to a vector by treating the matrix rows/columns as a set of 1D vectors and performing the specified operation on the vectors until a single row/column is obtained. For example, the function can be used to compute horizontal and vertical projections of a raster image. In case of ``CV_REDUCE_SUM`` and ``CV_REDUCE_AVG`` , the output may have a larger element bit-depth to preserve accuracy. And multi-channel arrays are also supported in these two reduction modes.
 
 .. seealso:: :ocv:func:`reduce`
diff --git a/modules/gpu/src/opencv2/gpu/device/block.hpp b/modules/gpu/include/opencv2/gpu/device/block.hpp
similarity index 100%
rename from modules/gpu/src/opencv2/gpu/device/block.hpp
rename to modules/gpu/include/opencv2/gpu/device/block.hpp
diff --git a/modules/gpu/include/opencv2/gpu/device/common.hpp b/modules/gpu/include/opencv2/gpu/device/common.hpp
index 141467fdc..7fb1036e8 100644
--- a/modules/gpu/include/opencv2/gpu/device/common.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/common.hpp
@@ -85,8 +85,6 @@ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int
         cv::gpu::error(cudaGetErrorString(err), file, line, func);
 }
 
-#ifdef __CUDACC__
-
 namespace cv { namespace gpu
 {
     __host__ __device__ __forceinline__ int divUp(int total, int grain)
@@ -96,19 +94,25 @@ namespace cv { namespace gpu
 
     namespace device
     {
+        using cv::gpu::divUp;
+
+#ifdef __CUDACC__
         typedef unsigned char uchar;
         typedef unsigned short ushort;
         typedef signed char schar;
-        typedef unsigned int uint;
+        #ifdef _WIN32
+            typedef unsigned int uint;
+        #endif
 
         template<class T> inline void bindTexture(const textureReference* tex, const PtrStepSz<T>& img)
         {
             cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
             cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
         }
+#endif // __CUDACC__
     }
 }}
 
-#endif // __CUDACC__
+
 
 #endif // __OPENCV_GPU_COMMON_HPP__
diff --git a/modules/gpu/include/opencv2/gpu/device/emulation.hpp b/modules/gpu/include/opencv2/gpu/device/emulation.hpp
index 074e91127..b6fba230e 100644
--- a/modules/gpu/include/opencv2/gpu/device/emulation.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/emulation.hpp
@@ -44,7 +44,6 @@
 #define OPENCV_GPU_EMULATION_HPP_
 
 #include "warp_reduce.hpp"
-#include <stdio.h>
 
 namespace cv { namespace gpu { namespace device
 {
diff --git a/modules/gpu/include/opencv2/gpu/device/functional.hpp b/modules/gpu/include/opencv2/gpu/device/functional.hpp
index c601cf527..6064e8e99 100644
--- a/modules/gpu/include/opencv2/gpu/device/functional.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/functional.hpp
@@ -302,18 +302,18 @@ namespace cv { namespace gpu { namespace device
     template <> struct name<type> : binary_function<type, type, type> \
     { \
         __device__ __forceinline__ type operator()(type lhs, type rhs) const {return op(lhs, rhs);} \
-        __device__ __forceinline__ name(const name& other):binary_function<type, type, type>(){}\
-        __device__ __forceinline__ name():binary_function<type, type, type>(){}\
+        __device__ __forceinline__ name() {}\
+        __device__ __forceinline__ name(const name&) {}\
     };
 
     template <typename T> struct maximum : binary_function<T, T, T>
     {
         __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
         {
-            return lhs < rhs ? rhs : lhs;
+            return max(lhs, rhs);
         }
-        __device__ __forceinline__ maximum(const maximum& other):binary_function<T, T, T>(){}
-        __device__ __forceinline__ maximum():binary_function<T, T, T>(){}
+        __device__ __forceinline__ maximum() {}
+        __device__ __forceinline__ maximum(const maximum&) {}
     };
 
     OPENCV_GPU_IMPLEMENT_MINMAX(maximum, uchar, ::max)
@@ -330,10 +330,10 @@ namespace cv { namespace gpu { namespace device
     {
         __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
         {
-            return lhs < rhs ? lhs : rhs;
+            return min(lhs, rhs);
         }
-        __device__ __forceinline__ minimum(const minimum& other):binary_function<T, T, T>(){}
-        __device__ __forceinline__ minimum():binary_function<T, T, T>(){}
+        __device__ __forceinline__ minimum() {}
+        __device__ __forceinline__ minimum(const minimum&) {}
     };
 
     OPENCV_GPU_IMPLEMENT_MINMAX(minimum, uchar, ::min)
@@ -350,6 +350,108 @@ namespace cv { namespace gpu { namespace device
 
     // Math functions
 ///bound=========================================
+
+    template <typename T> struct abs_func : unary_function<T, T>
+    {
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType x) const
+        {
+            return abs(x);
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<unsigned char> : unary_function<unsigned char, unsigned char>
+    {
+        __device__ __forceinline__ unsigned char operator ()(unsigned char x) const
+        {
+            return x;
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<signed char> : unary_function<signed char, signed char>
+    {
+        __device__ __forceinline__ signed char operator ()(signed char x) const
+        {
+            return ::abs((int)x);
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<char> : unary_function<char, char>
+    {
+        __device__ __forceinline__ char operator ()(char x) const
+        {
+            return ::abs((int)x);
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<unsigned short> : unary_function<unsigned short, unsigned short>
+    {
+        __device__ __forceinline__ unsigned short operator ()(unsigned short x) const
+        {
+            return x;
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<short> : unary_function<short, short>
+    {
+        __device__ __forceinline__ short operator ()(short x) const
+        {
+            return ::abs((int)x);
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<unsigned int> : unary_function<unsigned int, unsigned int>
+    {
+        __device__ __forceinline__ unsigned int operator ()(unsigned int x) const
+        {
+            return x;
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<int> : unary_function<int, int>
+    {
+        __device__ __forceinline__ int operator ()(int x) const
+        {
+            return ::abs(x);
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<float> : unary_function<float, float>
+    {
+        __device__ __forceinline__ float operator ()(float x) const
+        {
+            return ::fabsf(x);
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+    template <> struct abs_func<double> : unary_function<double, double>
+    {
+        __device__ __forceinline__ double operator ()(double x) const
+        {
+            return ::fabs(x);
+        }
+
+        __device__ __forceinline__ abs_func() {}
+        __device__ __forceinline__ abs_func(const abs_func&) {}
+    };
+
 #define OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(name, func) \
     template <typename T> struct name ## _func : unary_function<T, float> \
     { \
@@ -357,6 +459,8 @@ namespace cv { namespace gpu { namespace device
         { \
             return func ## f(v); \
         } \
+        __device__ __forceinline__ name ## _func() {} \
+        __device__ __forceinline__ name ## _func(const name ## _func&) {} \
     }; \
     template <> struct name ## _func<double> : unary_function<double, double> \
     { \
@@ -364,6 +468,8 @@ namespace cv { namespace gpu { namespace device
         { \
             return func(v); \
         } \
+        __device__ __forceinline__ name ## _func() {} \
+        __device__ __forceinline__ name ## _func(const name ## _func&) {} \
     };
 
 #define OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR(name, func) \
@@ -382,7 +488,6 @@ namespace cv { namespace gpu { namespace device
         } \
     };
 
-    OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(fabs, ::fabs)
     OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(sqrt, ::sqrt)
     OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp, ::exp)
     OPENCV_GPU_IMPLEMENT_UN_FUNCTOR(exp2, ::exp2)
diff --git a/modules/gpu/include/opencv2/gpu/device/vec_math.hpp b/modules/gpu/include/opencv2/gpu/device/vec_math.hpp
index 0ec790c0b..1c46dc0c3 100644
--- a/modules/gpu/include/opencv2/gpu/device/vec_math.hpp
+++ b/modules/gpu/include/opencv2/gpu/device/vec_math.hpp
@@ -280,7 +280,7 @@ namespace cv { namespace gpu { namespace device
     OPENCV_GPU_IMPLEMENT_VEC_UNOP (type, operator ! , logical_not) \
     OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, max, maximum) \
     OPENCV_GPU_IMPLEMENT_VEC_BINOP(type, min, minimum) \
-    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, fabs, fabs_func) \
+    OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, abs, abs_func) \
     OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, sqrt, sqrt_func) \
     OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp, exp_func) \
     OPENCV_GPU_IMPLEMENT_VEC_UNOP(type, exp2, exp2_func) \
@@ -327,4 +327,4 @@ namespace cv { namespace gpu { namespace device
     #undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP
 }}} // namespace cv { namespace gpu { namespace device
 
-#endif // __OPENCV_GPU_VECMATH_HPP__
\ No newline at end of file
+#endif // __OPENCV_GPU_VECMATH_HPP__
diff --git a/modules/gpu/src/brute_force_matcher.cpp b/modules/gpu/src/brute_force_matcher.cpp
index 0843c7d1a..bc1818886 100644
--- a/modules/gpu/src/brute_force_matcher.cpp
+++ b/modules/gpu/src/brute_force_matcher.cpp
@@ -268,14 +268,14 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
     const float* distance_ptr =  distance.ptr<float>();
     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr)
     {
-        int _trainIdx = *trainIdx_ptr;
+        int train_idx = *trainIdx_ptr;
 
-        if (_trainIdx == -1)
+        if (train_idx == -1)
             continue;
 
-        float _distance = *distance_ptr;
+        float distance_local = *distance_ptr;
 
-        DMatch m(queryIdx, _trainIdx, 0, _distance);
+        DMatch m(queryIdx, train_idx, 0, distance_local);
 
         matches.push_back(m);
     }
@@ -413,16 +413,16 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
     const float* distance_ptr =  distance.ptr<float>();
     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
     {
-        int trainIdx = *trainIdx_ptr;
+        int _trainIdx = *trainIdx_ptr;
 
-        if (trainIdx == -1)
+        if (_trainIdx == -1)
             continue;
 
-        int imgIdx = *imgIdx_ptr;
+        int _imgIdx = *imgIdx_ptr;
 
-        float distance = *distance_ptr;
+        float _distance = *distance_ptr;
 
-        DMatch m(queryIdx, trainIdx, imgIdx, distance);
+        DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);
 
         matches.push_back(m);
     }
@@ -548,13 +548,13 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, c
 
         for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)
         {
-            int trainIdx = *trainIdx_ptr;
+            int _trainIdx = *trainIdx_ptr;
 
-            if (trainIdx != -1)
+            if (_trainIdx != -1)
             {
-                float distance = *distance_ptr;
+                float _distance = *distance_ptr;
 
-                DMatch m(queryIdx, trainIdx, 0, distance);
+                DMatch m(queryIdx, _trainIdx, 0, _distance);
 
                 curMatches.push_back(m);
             }
@@ -667,15 +667,15 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Convert(const Mat& trainIdx,
 
         for (int i = 0; i < 2; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
         {
-            int trainIdx = *trainIdx_ptr;
+            int _trainIdx = *trainIdx_ptr;
 
-            if (trainIdx != -1)
+             if (_trainIdx != -1)
             {
-                int imgIdx = *imgIdx_ptr;
+                int _imgIdx = *imgIdx_ptr;
 
-                float distance = *distance_ptr;
+                float _distance = *distance_ptr;
 
-                DMatch m(queryIdx, trainIdx, imgIdx, distance);
+                DMatch m(queryIdx, _trainIdx, _imgIdx, _distance);
 
                 curMatches.push_back(m);
             }
@@ -852,25 +852,25 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
         const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
         const float* distance_ptr = distance.ptr<float>(queryIdx);
 
-        const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
+        const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
 
-        if (nMatches == 0)
+        if (nMatched == 0)
         {
             if (!compactResult)
                 matches.push_back(vector<DMatch>());
             continue;
         }
 
-        matches.push_back(vector<DMatch>(nMatches));
+        matches.push_back(vector<DMatch>(nMatched));
         vector<DMatch>& curMatches = matches.back();
 
-        for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr)
+        for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++distance_ptr)
         {
-            int trainIdx = *trainIdx_ptr;
+            int _trainIdx = *trainIdx_ptr;
 
-            float distance = *distance_ptr;
+            float _distance = *distance_ptr;
 
-            DMatch m(queryIdx, trainIdx, 0, distance);
+            DMatch m(queryIdx, _trainIdx, 0, _distance);
 
             curMatches[i] = m;
         }
@@ -990,9 +990,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
         const int* imgIdx_ptr = imgIdx.ptr<int>(queryIdx);
         const float* distance_ptr = distance.ptr<float>(queryIdx);
 
-        const int nMatches = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
+        const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);
 
-        if (nMatches == 0)
+        if (nMatched == 0)
         {
             if (!compactResult)
                 matches.push_back(vector<DMatch>());
@@ -1001,9 +1001,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
 
         matches.push_back(vector<DMatch>());
         vector<DMatch>& curMatches = matches.back();
-        curMatches.reserve(nMatches);
+        curMatches.reserve(nMatched);
 
-        for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
+        for (int i = 0; i < nMatched; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
         {
             int _trainIdx = *trainIdx_ptr;
             int _imgIdx = *imgIdx_ptr;
diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp
index f2aa1a4b7..ac839ce96 100644
--- a/modules/gpu/src/cascadeclassifier.cpp
+++ b/modules/gpu/src/cascadeclassifier.cpp
@@ -622,7 +622,7 @@ private:
         }
 
         // copy data structures on gpu
-        stage_mat.upload(cv::Mat(1, stages.size() * sizeof(Stage), CV_8UC1, (uchar*)&(stages[0]) ));
+        stage_mat.upload(cv::Mat(1, (int) (stages.size() * sizeof(Stage)), CV_8UC1, (uchar*)&(stages[0]) ));
         trees_mat.upload(cv::Mat(cl_trees).reshape(1,1));
         nodes_mat.upload(cv::Mat(cl_nodes).reshape(1,1));
         leaves_mat.upload(cv::Mat(cl_leaves).reshape(1,1));
diff --git a/modules/gpu/src/cuda/ccomponetns.cu b/modules/gpu/src/cuda/ccomponetns.cu
index 62e81376a..c094e08c0 100644
--- a/modules/gpu/src/cuda/ccomponetns.cu
+++ b/modules/gpu/src/cuda/ccomponetns.cu
@@ -497,6 +497,7 @@ namespace cv { namespace gpu { namespace device
 
         void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
         {
+            (void) flags;
             dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
             dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
 
@@ -529,4 +530,4 @@ namespace cv { namespace gpu { namespace device
     }
 } } }
 
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/gftt.cu b/modules/gpu/src/cuda/gftt.cu
index 4c21bd911..cae217e39 100644
--- a/modules/gpu/src/cuda/gftt.cu
+++ b/modules/gpu/src/cuda/gftt.cu
@@ -47,6 +47,7 @@
 
 #if !defined CUDA_DISABLER
 
+#include <thrust/device_ptr.h>
 #include <thrust/sort.h>
 
 #include "opencv2/gpu/device/common.hpp"
@@ -148,4 +149,4 @@ namespace cv { namespace gpu { namespace device
 }}}
 
 
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/hough.cu b/modules/gpu/src/cuda/hough.cu
index ee4d02591..8e35aa858 100644
--- a/modules/gpu/src/cuda/hough.cu
+++ b/modules/gpu/src/cuda/hough.cu
@@ -42,7 +42,9 @@
 
 #if !defined CUDA_DISABLER
 
+#include <thrust/device_ptr.h>
 #include <thrust/sort.h>
+
 #include "opencv2/gpu/device/common.hpp"
 #include "opencv2/gpu/device/emulation.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
@@ -1509,4 +1511,4 @@ namespace cv { namespace gpu { namespace device
 }}}
 
 
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/lbp.cu b/modules/gpu/src/cuda/lbp.cu
index 55f5d7512..a09aa1e79 100644
--- a/modules/gpu/src/cuda/lbp.cu
+++ b/modules/gpu/src/cuda/lbp.cu
@@ -295,7 +295,7 @@ namespace cv { namespace gpu { namespace device
             int grid = divUp(workAmount, block);
             cudaFuncSetCacheConfig(lbp_cascade, cudaFuncCachePreferL1);
             Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize);
-            lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), integral.step / sizeof(int), objects, classified);
+            lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), (int)integral.step / sizeof(int), objects, classified);
         }
     }
 }}}
diff --git a/modules/gpu/src/cuda/mathfunc.cu b/modules/gpu/src/cuda/mathfunc.cu
index 516ea37a8..3ae8fdc7b 100644
--- a/modules/gpu/src/cuda/mathfunc.cu
+++ b/modules/gpu/src/cuda/mathfunc.cu
@@ -76,7 +76,7 @@ namespace cv { namespace gpu { namespace device
             static __device__ __forceinline__ void calc(int x, int y, float x_data, float y_data, float* dst, size_t dst_step, float scale)
             {
                 float angle = ::atan2f(y_data, x_data);
-                angle += (angle < 0) * 2.0 * CV_PI;
+                angle += (angle < 0) * 2.0f * CV_PI_F;
                 dst[y * dst_step + x] = scale * angle;
             }
         };
@@ -140,7 +140,7 @@ namespace cv { namespace gpu { namespace device
             grid.x = divUp(x.cols, threads.x);
             grid.y = divUp(x.rows, threads.y);
 
-            const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;
+            const float scale = angleInDegrees ? (180.0f / CV_PI_F) : 1.f;
 
             cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
                 x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
@@ -190,7 +190,7 @@ namespace cv { namespace gpu { namespace device
             grid.x = divUp(mag.cols, threads.x);
             grid.y = divUp(mag.rows, threads.y);
 
-            const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;
+            const float scale = angleInDegrees ? (CV_PI_F / 180.0f) : 1.0f;
 
             polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
                 angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
@@ -214,4 +214,4 @@ namespace cv { namespace gpu { namespace device
     } // namespace mathfunc
 }}} // namespace cv { namespace gpu { namespace device
 
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/optical_flow.cu b/modules/gpu/src/cuda/optical_flow.cu
index 0c8d140f9..d2c68a79f 100644
--- a/modules/gpu/src/cuda/optical_flow.cu
+++ b/modules/gpu/src/cuda/optical_flow.cu
@@ -164,40 +164,40 @@ namespace cv { namespace gpu { namespace device
 
                 r = ::fmin(r, 2.5f);
 
-                v[1].x = arrow_x + r * ::cosf(theta - CV_PI / 2.0f);
-                v[1].y = arrow_y + r * ::sinf(theta - CV_PI / 2.0f);
+                v[1].x = arrow_x + r * ::cosf(theta - CV_PI_F / 2.0f);
+                v[1].y = arrow_y + r * ::sinf(theta - CV_PI_F / 2.0f);
 
-                v[4].x = arrow_x + r * ::cosf(theta + CV_PI / 2.0f);
-                v[4].y = arrow_y + r * ::sinf(theta + CV_PI / 2.0f);
+                v[4].x = arrow_x + r * ::cosf(theta + CV_PI_F / 2.0f);
+                v[4].y = arrow_y + r * ::sinf(theta + CV_PI_F / 2.0f);
 
                 int indx = (y * u_avg.cols + x) * NUM_VERTS_PER_ARROW * 3;
 
-                color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
+                color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
                 vertex_data[indx++] = v[0].x * xscale;
                 vertex_data[indx++] = v[0].y * yscale;
                 vertex_data[indx++] = v[0].z;
 
-                color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
+                color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
                 vertex_data[indx++] = v[1].x * xscale;
                 vertex_data[indx++] = v[1].y * yscale;
                 vertex_data[indx++] = v[1].z;
 
-                color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
+                color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
                 vertex_data[indx++] = v[2].x * xscale;
                 vertex_data[indx++] = v[2].y * yscale;
                 vertex_data[indx++] = v[2].z;
 
-                color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
+                color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
                 vertex_data[indx++] = v[3].x * xscale;
                 vertex_data[indx++] = v[3].y * yscale;
                 vertex_data[indx++] = v[3].z;
 
-                color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
+                color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
                 vertex_data[indx++] = v[4].x * xscale;
                 vertex_data[indx++] = v[4].y * yscale;
                 vertex_data[indx++] = v[4].z;
 
-                color_data[indx] = (theta - CV_PI) / CV_PI * 180.0f;
+                color_data[indx] = (theta - CV_PI_F) / CV_PI_F * 180.0f;
                 vertex_data[indx++] = v[5].x * xscale;
                 vertex_data[indx++] = v[5].y * yscale;
                 vertex_data[indx++] = v[5].z;
@@ -217,4 +217,4 @@ namespace cv { namespace gpu { namespace device
     }
 }}}
 
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/optical_flow_farneback.cu b/modules/gpu/src/cuda/optical_flow_farneback.cu
index 8231775b8..5bbca34f1 100644
--- a/modules/gpu/src/cuda/optical_flow_farneback.cu
+++ b/modules/gpu/src/cuda/optical_flow_farneback.cu
@@ -42,7 +42,6 @@
 
 #if !defined CUDA_DISABLER
 
-#include <stdio.h>
 #include "internal_shared.hpp"
 #include "opencv2/gpu/device/common.hpp"
 #include "opencv2/gpu/device/border_interpolate.hpp"
@@ -57,8 +56,6 @@
 #define BORDER_SIZE 5
 #define MAX_KSIZE_HALF 100
 
-using namespace std;
-
 namespace cv { namespace gpu { namespace device { namespace optflow_farneback
 {
     __constant__ float c_g[8];
diff --git a/modules/gpu/src/cuda/pyrlk.cu b/modules/gpu/src/cuda/pyrlk.cu
index d1a65c210..811c3b90b 100644
--- a/modules/gpu/src/cuda/pyrlk.cu
+++ b/modules/gpu/src/cuda/pyrlk.cu
@@ -267,7 +267,7 @@ namespace cv { namespace gpu { namespace device
         }
         __device__ __forceinline__ float4 abs_(const float4& a)
         {
-            return fabs(a);
+            return abs(a);
         }
 
         template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
@@ -681,4 +681,4 @@ namespace cv { namespace gpu { namespace device
     }
 }}}
 
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/split_merge.cu b/modules/gpu/src/cuda/split_merge.cu
index a62c76bbe..834b283f0 100644
--- a/modules/gpu/src/cuda/split_merge.cu
+++ b/modules/gpu/src/cuda/split_merge.cu
@@ -508,4 +508,4 @@ namespace cv { namespace gpu { namespace device
 }}} // namespace cv { namespace gpu { namespace device
 
 
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/stereobp.cu b/modules/gpu/src/cuda/stereobp.cu
index 736aa3f1f..18d3ae797 100644
--- a/modules/gpu/src/cuda/stereobp.cu
+++ b/modules/gpu/src/cuda/stereobp.cu
@@ -454,7 +454,7 @@ namespace cv { namespace gpu { namespace device
             grid.x = divUp(cols, threads.x << 1);
             grid.y = divUp(rows, threads.y);
 
-            int elem_step = u.step/sizeof(T);
+            int elem_step = (int)(u.step / sizeof(T));
 
             for(int t = 0; t < iters; ++t)
             {
diff --git a/modules/gpu/src/cuda/surf.cu b/modules/gpu/src/cuda/surf.cu
index 37c4eb48a..b37f47f09 100644
--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -638,7 +638,7 @@ namespace cv { namespace gpu { namespace device
                 kp_dir *= 180.0f / CV_PI_F;
 
                 kp_dir = 360.0f - kp_dir;
-                if (abs(kp_dir - 360.f) < FLT_EPSILON)
+                if (::fabsf(kp_dir - 360.f) < FLT_EPSILON)
                     kp_dir = 0.f;
 
                 featureDir[blockIdx.x] = kp_dir;
@@ -1003,4 +1003,4 @@ namespace cv { namespace gpu { namespace device
 }}} // namespace cv { namespace gpu { namespace device
 
 
-#endif /* CUDA_DISABLER */
\ No newline at end of file
+#endif /* CUDA_DISABLER */
diff --git a/modules/gpu/src/cuda/texture_binder.hpp b/modules/gpu/src/cuda/texture_binder.hpp
index 4f42b099d..391eb9a19 100644
--- a/modules/gpu/src/cuda/texture_binder.hpp
+++ b/modules/gpu/src/cuda/texture_binder.hpp
@@ -85,7 +85,7 @@ namespace cv
 
   namespace device
   {
-      using pcl::gpu::TextureBinder;
+      using cv::gpu::TextureBinder;
   }
 }
 
diff --git a/modules/gpu/src/fast.cpp b/modules/gpu/src/fast.cpp
index d09210da4..f8b3b9887 100644
--- a/modules/gpu/src/fast.cpp
+++ b/modules/gpu/src/fast.cpp
@@ -125,9 +125,6 @@ int cv::gpu::FAST_GPU::calcKeyPointsLocation(const GpuMat& img, const GpuMat& ma
     CV_Assert(img.type() == CV_8UC1);
     CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == img.size()));
 
-    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
-        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
-
     int maxKeypoints = static_cast<int>(keypointsRatio * img.size().area());
 
     ensureSizeIsEnough(1, maxKeypoints, CV_16SC2, kpLoc_);
@@ -148,9 +145,6 @@ int cv::gpu::FAST_GPU::getKeyPoints(GpuMat& keypoints)
 {
     using namespace cv::gpu::device::fast;
 
-    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
-        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
-
     if (count_ == 0)
         return 0;
 
diff --git a/modules/gpu/src/gftt.cpp b/modules/gpu/src/gftt.cpp
index 0c8f165ae..6bb73de75 100644
--- a/modules/gpu/src/gftt.cpp
+++ b/modules/gpu/src/gftt.cpp
@@ -68,9 +68,6 @@ void cv::gpu::GoodFeaturesToTrackDetector_GPU::operator ()(const GpuMat& image,
     CV_Assert(qualityLevel > 0 && minDistance >= 0 && maxCorners >= 0);
     CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.size() == image.size()));
 
-    if (!TargetArchs::builtWith(GLOBAL_ATOMICS) || !DeviceInfo().supports(GLOBAL_ATOMICS))
-        CV_Error(CV_StsNotImplemented, "The device doesn't support global atomics");
-
     ensureSizeIsEnough(image.size(), CV_32F, eig_);
 
     if (useHarrisDetector)
diff --git a/modules/gpu/src/nvidia/core/NCV.cu b/modules/gpu/src/nvidia/core/NCV.cu
index 5d1b5d12c..77e59cc5c 100644
--- a/modules/gpu/src/nvidia/core/NCV.cu
+++ b/modules/gpu/src/nvidia/core/NCV.cu
@@ -45,8 +45,6 @@
 #include <vector>
 #include "NCV.hpp"
 
-using namespace std;
-
 
 //==============================================================================
 //
@@ -55,16 +53,16 @@ using namespace std;
 //==============================================================================
 
 
-static void stdDebugOutput(const string &msg)
+static void stdDebugOutput(const std::string &msg)
 {
-    cout << msg;
+    std::cout << msg;
 }
 
 
 static NCVDebugOutputHandler *debugOutputHandler = stdDebugOutput;
 
 
-void ncvDebugOutput(const string &msg)
+void ncvDebugOutput(const std::string &msg)
 {
     debugOutputHandler(msg);
 }
diff --git a/modules/gpu/src/nvidia/core/NCV.hpp b/modules/gpu/src/nvidia/core/NCV.hpp
index ddac47c92..703cb827b 100644
--- a/modules/gpu/src/nvidia/core/NCV.hpp
+++ b/modules/gpu/src/nvidia/core/NCV.hpp
@@ -288,7 +288,7 @@ NCV_EXPORTS void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
     do \
     { \
         cudaError_t res = cudacall; \
-        ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << res, errCode); \
+        ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << (int)res, errCode); \
     } while (0)
 
 
@@ -296,7 +296,7 @@ NCV_EXPORTS void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func);
     do \
     { \
         cudaError_t res = cudaGetLastError(); \
-        ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << res, errCode); \
+        ncvAssertPrintReturn(cudaSuccess==res, "cudaError_t=" << (int)res, errCode); \
     } while (0)
 
 
diff --git a/samples/gpu/cascadeclassifier_nvidia_api.cpp b/samples/gpu/cascadeclassifier_nvidia_api.cpp
index da98643af..99c95ab97 100644
--- a/samples/gpu/cascadeclassifier_nvidia_api.cpp
+++ b/samples/gpu/cascadeclassifier_nvidia_api.cpp
@@ -30,7 +30,7 @@ const Size2i preferredVideoFrameSize(640, 480);
 const string wndTitle = "NVIDIA Computer Vision :: Haar Classifiers Cascade";
 
 
-void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
+static void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
 {
     int fontFace = FONT_HERSHEY_DUPLEX;
     double fontScale = 0.8;
@@ -45,7 +45,7 @@ void matPrint(Mat &img, int lineOffsY, Scalar fontColor, const string &ss)
 }
 
 
-void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bFilter, double fps)
+static void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bFilter, double fps)
 {
     Scalar fontColorRed = CV_RGB(255,0,0);
     Scalar fontColorNV  = CV_RGB(118,185,0);
@@ -74,7 +74,7 @@ void displayState(Mat &canvas, bool bHelp, bool bGpu, bool bLargestFace, bool bF
 }
 
 
-NCVStatus process(Mat *srcdst,
+static NCVStatus process(Mat *srcdst,
                   Ncv32u width, Ncv32u height,
                   NcvBool bFilterRects, NcvBool bLargestFace,
                   HaarClassifierCascadeDescriptor &haar,
@@ -281,7 +281,7 @@ int main(int argc, const char** argv)
     //==============================================================================
 
     namedWindow(wndTitle, 1);
-    Mat gray, frameDisp;
+    Mat frameDisp;
 
     do
     {
diff --git a/samples/gpu/driver_api_multi.cpp b/samples/gpu/driver_api_multi.cpp
index 560908ce6..2d743f0e9 100644
--- a/samples/gpu/driver_api_multi.cpp
+++ b/samples/gpu/driver_api_multi.cpp
@@ -54,14 +54,8 @@ inline void safeCall_(int code, const char* expr, const char* file, int line)
 // Each GPU is associated with its own context
 CUcontext contexts[2];
 
-int main(int argc, char **argv)
+int main()
 {
-    if (argc > 1)
-    {
-        cout << "CUDA driver API sample\n";
-        return -1;
-    }
-
     int num_devices = getCudaEnabledDeviceCount();
     if (num_devices < 2)
     {
diff --git a/samples/gpu/driver_api_stereo_multi.cpp b/samples/gpu/driver_api_stereo_multi.cpp
index 3c663a5e5..b8f99e810 100644
--- a/samples/gpu/driver_api_stereo_multi.cpp
+++ b/samples/gpu/driver_api_stereo_multi.cpp
@@ -76,7 +76,7 @@ GpuMat d_result[2];
 // CPU result
 Mat result;
 
-void printHelp()
+static void printHelp()
 {
     std::cout << "Usage: driver_api_stereo_multi_gpu --left <left_image> --right <right_image>\n";
 }
diff --git a/samples/gpu/opticalflow_nvidia_api.cpp b/samples/gpu/opticalflow_nvidia_api.cpp
index 8a149d740..05a37ef69 100644
--- a/samples/gpu/opticalflow_nvidia_api.cpp
+++ b/samples/gpu/opticalflow_nvidia_api.cpp
@@ -59,7 +59,7 @@ public:
 class RgbToR
 {
 public:
-    float operator ()(unsigned char b, unsigned char g, unsigned char r)
+    float operator ()(unsigned char /*b*/, unsigned char /*g*/, unsigned char r)
     {
         return static_cast<float>(r)/255.0f;
     }
@@ -69,7 +69,7 @@ public:
 class RgbToG
 {
 public:
-    float operator ()(unsigned char b, unsigned char g, unsigned char r)
+    float operator ()(unsigned char /*b*/, unsigned char g, unsigned char /*r*/)
     {
         return static_cast<float>(g)/255.0f;
     }
@@ -78,7 +78,7 @@ public:
 class RgbToB
 {
 public:
-    float operator ()(unsigned char b, unsigned char g, unsigned char r)
+    float operator ()(unsigned char b, unsigned char /*g*/, unsigned char /*r*/)
     {
         return static_cast<float>(b)/255.0f;
     }
@@ -135,7 +135,7 @@ NCVStatus CopyData(const IplImage *image, const NCVMatrixAlloc<Ncv32f> &dst)
     return NCV_SUCCESS;
 }
 
-NCVStatus LoadImages (const char *frame0Name,
+static NCVStatus LoadImages (const char *frame0Name,
                       const char *frame1Name,
                       int &width,
                       int &height,
@@ -186,7 +186,7 @@ inline T MapValue (T x, T a, T b, T c, T d)
     return c + (d - c) * (x - a) / (b - a);
 }
 
-NCVStatus ShowFlow (NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const char *name)
+static NCVStatus ShowFlow (NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const char *name)
 {
     IplImage *flowField;
 
@@ -246,7 +246,7 @@ NCVStatus ShowFlow (NCVMatrixAlloc<Ncv32f> &u, NCVMatrixAlloc<Ncv32f> &v, const
     return NCV_SUCCESS;
 }
 
-IplImage *CreateImage (NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g, NCVMatrixAlloc<Ncv32f> &h_b)
+static IplImage *CreateImage (NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g, NCVMatrixAlloc<Ncv32f> &h_b)
 {
     CvSize imageSize = cvSize (h_r.width (), h_r.height ());
     IplImage *image  = cvCreateImage (imageSize, IPL_DEPTH_8U, 4);
@@ -270,7 +270,7 @@ IplImage *CreateImage (NCVMatrixAlloc<Ncv32f> &h_r, NCVMatrixAlloc<Ncv32f> &h_g,
     return image;
 }
 
-void PrintHelp ()
+static void PrintHelp ()
 {
     std::cout << "Usage help:\n";
     std::cout << std::setiosflags(std::ios::left);
@@ -286,7 +286,7 @@ void PrintHelp ()
     std::cout << "\t" << std::setw(15) << PARAM_HELP << " - display this help message\n";
 }
 
-int ProcessCommandLine(int argc, char **argv,
+static int ProcessCommandLine(int argc, char **argv,
                        Ncv32f &timeStep,
                        char *&frame0Name,
                        char *&frame1Name,
diff --git a/samples/gpu/stereo_multi.cpp b/samples/gpu/stereo_multi.cpp
index c7fa5539b..d424bf90b 100644
--- a/samples/gpu/stereo_multi.cpp
+++ b/samples/gpu/stereo_multi.cpp
@@ -47,7 +47,7 @@ GpuMat d_result[2];
 // CPU result
 Mat result;
 
-void printHelp()
+static void printHelp()
 {
     std::cout << "Usage: stereo_multi_gpu --left <image> --right <image>\n";
 }