merged GPU scan

2012-06-28 16:00:38 +00:00
parent 6cca6a4548
commit 2777ebb8a0
25 changed files with 515 additions and 267 deletions
--- a/cmake/OpenCVDetectCUDA.cmake
+++ b/cmake/OpenCVDetectCUDA.cmake
@@ -1,8 +1,8 @@
 if(${CMAKE_VERSION} VERSION_LESS "2.8.3")
  message(STATUS WITH_CUDA flag requires CMake 2.8.3. CUDA support is disabled.)
-  return()  
+  return()
 endif()
-  
+
 find_package(CUDA 4.1)
 if(CUDA_FOUND)
@@ -23,7 +23,7 @@ if(CUDA_FOUND)
  else()
    set(CUDA_ARCH_BIN "1.1 1.2 1.3 2.0 2.1(2.0)" CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
  endif()
-    
+
  set(CUDA_ARCH_PTX "2.0" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
  string(REGEX REPLACE "\\." "" ARCH_BIN_NO_POINTS "${CUDA_ARCH_BIN}")
@@ -89,8 +89,8 @@ if(CUDA_FOUND)
      set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fno-finite-math-only)
    endif()
-    # we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1) 
+    # we remove -ggdb3 flag as it leads to preprocessor errors when compiling CUDA files (CUDA 4.1)
-    set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG}) 
+    set(CMAKE_CXX_FLAGS_DEBUG_ ${CMAKE_CXX_FLAGS_DEBUG})
    string(REPLACE "-ggdb3" "" CMAKE_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
    CUDA_COMPILE(${VAR} ${ARGN})
    set(CMAKE_CXX_DEBUG_FLAGS ${CMAKE_CXX_FLAGS_DEBUG_})
--- a/modules/gpu/perf/perf_imgproc.cpp
+++ b/modules/gpu/perf/perf_imgproc.cpp
@@ -90,6 +90,40 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Resize, testing::Combine(
                    Interpolation(cv::INTER_CUBIC),   Interpolation(cv::INTER_AREA)),
    testing::Values(Scale(0.5), Scale(0.3), Scale(2.0))));
 GPU_PERF_TEST(ResizeArea, cv::gpu::DeviceInfo, cv::Size, MatType, Scale)
 {
    cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
    cv::gpu::setDevice(devInfo.deviceID());
    cv::Size size = GET_PARAM(1);
    int type = GET_PARAM(2);
    int interpolation = cv::INTER_AREA;
    double f = GET_PARAM(3);
    cv::Mat src_host(size, type);
    fill(src_host, 0, 255);
    cv::gpu::GpuMat src(src_host);
    cv::gpu::GpuMat dst;
    cv::gpu::resize(src, dst, cv::Size(), f, f, interpolation);
    declare.time(1.0);
    TEST_CYCLE()
    {
        cv::gpu::resize(src, dst, cv::Size(), f, f, interpolation);
    }
 }
 INSTANTIATE_TEST_CASE_P(ImgProc, ResizeArea, testing::Combine(
    ALL_DEVICES,
    testing::Values(perf::sz1080p/*, cv::Size(4096, 2048)*/),
    testing::Values(MatType(CV_8UC1), MatType(CV_8UC3), MatType(CV_8UC4),
                    MatType(CV_16UC1), MatType(CV_16UC3), MatType(CV_16UC4),
                    MatType(CV_32FC1), MatType(CV_32FC3), MatType(CV_32FC4)),
    testing::Values(Scale(0.2),Scale(0.1),Scale(0.05))));
 //////////////////////////////////////////////////////////////////////
 // WarpAffine
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@@ -72,7 +72,7 @@ namespace cv { namespace gpu { namespace device
        struct Mask8U
        {
-            explicit Mask8U(PtrStepb mask): mask(mask) {}
+            explicit Mask8U(PtrStepb mask_): mask(mask_) {}
            __device__ __forceinline__ bool operator()(int y, int x) const
            {
--- a/modules/gpu/src/cuda/resize.cu
+++ b/modules/gpu/src/cuda/resize.cu
@@ -46,7 +46,8 @@
 #include "opencv2/gpu/device/vec_math.hpp"
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/filters.hpp"
-# include <cfloat>
+#include <cfloat>
 #include <opencv2/gpu/device/scan.hpp>
 namespace cv { namespace gpu { namespace device
 {
--- a/modules/gpu/src/cuda/split_merge.cu
+++ b/modules/gpu/src/cuda/split_merge.cu
@@ -228,9 +228,9 @@ namespace cv { namespace gpu { namespace device
        template <typename T>
        static void mergeC2_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
        {
-            dim3 blockDim(32, 8);
+            dim3 block(32, 8);
-            dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
+            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-            mergeC2_<T><<<gridDim, blockDim, 0, stream>>>(
+            mergeC2_<T><<<grid, block, 0, stream>>>(
                    src[0].data, src[0].step,
                    src[1].data, src[1].step,
                    dst.rows, dst.cols, dst.data, dst.step);
@@ -244,9 +244,9 @@ namespace cv { namespace gpu { namespace device
        template <typename T>
        static void mergeC3_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
        {
-            dim3 blockDim(32, 8);
+            dim3 block(32, 8);
-            dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
+            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-            mergeC3_<T><<<gridDim, blockDim, 0, stream>>>(
+            mergeC3_<T><<<grid, block, 0, stream>>>(
                    src[0].data, src[0].step,
                    src[1].data, src[1].step,
                    src[2].data, src[2].step,
@@ -261,9 +261,9 @@ namespace cv { namespace gpu { namespace device
        template <typename T>
        static void mergeC4_(const DevMem2Db* src, DevMem2Db& dst, const cudaStream_t& stream)
        {
-            dim3 blockDim(32, 8);
+            dim3 block(32, 8);
-            dim3 gridDim(divUp(dst.cols, blockDim.x), divUp(dst.rows, blockDim.y));
+            dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-            mergeC4_<T><<<gridDim, blockDim, 0, stream>>>(
+            mergeC4_<T><<<grid, block, 0, stream>>>(
                    src[0].data, src[0].step,
                    src[1].data, src[1].step,
                    src[2].data, src[2].step,
@@ -437,9 +437,9 @@ namespace cv { namespace gpu { namespace device
        template <typename T>
        static void splitC2_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
        {
-            dim3 blockDim(32, 8);
+            dim3 block(32, 8);
-            dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
+            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-            splitC2_<T><<<gridDim, blockDim, 0, stream>>>(
+            splitC2_<T><<<grid, block, 0, stream>>>(
                    src.data, src.step, src.rows, src.cols,
                    dst[0].data, dst[0].step,
                    dst[1].data, dst[1].step);
@@ -453,9 +453,9 @@ namespace cv { namespace gpu { namespace device
        template <typename T>
        static void splitC3_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
        {
-            dim3 blockDim(32, 8);
+            dim3 block(32, 8);
-            dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
+            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-            splitC3_<T><<<gridDim, blockDim, 0, stream>>>(
+            splitC3_<T><<<grid, block, 0, stream>>>(
                    src.data, src.step, src.rows, src.cols,
                    dst[0].data, dst[0].step,
                    dst[1].data, dst[1].step,
@@ -470,9 +470,9 @@ namespace cv { namespace gpu { namespace device
        template <typename T>
        static void splitC4_(const DevMem2Db& src, DevMem2Db* dst, const cudaStream_t& stream)
        {
-            dim3 blockDim(32, 8);
+            dim3 block(32, 8);
-            dim3 gridDim(divUp(src.cols, blockDim.x), divUp(src.rows, blockDim.y));
+            dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-            splitC4_<T><<<gridDim, blockDim, 0, stream>>>(
+            splitC4_<T><<<grid, block, 0, stream>>>(
                     src.data, src.step, src.rows, src.cols,
                     dst[0].data, dst[0].step,
                     dst[1].data, dst[1].step,
--- a/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu
+++ b/modules/gpu/src/nvidia/NCVBroxOpticalFlow.cu
@@ -1,7 +1,7 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
+// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-// 
+//
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
@@ -129,9 +129,9 @@ texture<float, 1, cudaReadModeElementType> tex_diffusivity_y;
 __global__ void pointwise_add(float *d_res, const float *d_op1, const float *d_op2, const int len)
 {
    const int pos = blockIdx.x*blockDim.x + threadIdx.x;
-    
+
    if(pos >= len) return;
-    
+
    d_res[pos] = d_op1[pos] + d_op2[pos];
 }
@@ -265,7 +265,7 @@ __forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const flo
 ///////////////////////////////////////////////////////////////////////////////
 template<int tex_id>
 __forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p)
-{    
+{
    //position within shared memory array
    const int ijs = js * PSOR_PITCH + is;
    //mirror reflection across borders
@@ -299,7 +299,7 @@ __forceinline__ __device__ void load_array_element(float *smem, int is, int js,
 ///\param h number of rows in global memory array
 ///\param p global memory array pitch in floats
 ///////////////////////////////////////////////////////////////////////////////
-template<int tex> 
+template<int tex>
 __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p)
 {
    const int i = threadIdx.x + 2;
@@ -381,7 +381,7 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i
 /// \param gamma (in) gamma in Brox model (edge importance)
 ///////////////////////////////////////////////////////////////////////////////
-__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y, 
+__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y,
                                                        float *denominator_u, float *denominator_v,
                                                        float *numerator_dudv,
                                                        float *numerator_u, float *numerator_v,
@@ -532,16 +532,16 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
 // Red-Black SOR
 /////////////////////////////////////////////////////////////////////////////////////////
-template<int isBlack> __global__ void sor_pass(float *new_du, 
+template<int isBlack> __global__ void sor_pass(float *new_du,
-                                               float *new_dv, 
+                                               float *new_dv,
-                                               const float *g_inv_denominator_u, 
+                                               const float *g_inv_denominator_u,
                                               const float *g_inv_denominator_v,
-                                               const float *g_numerator_u, 
+                                               const float *g_numerator_u,
-                                               const float *g_numerator_v, 
+                                               const float *g_numerator_v,
-                                               const float *g_numerator_dudv, 
+                                               const float *g_numerator_dudv,
-                                               float omega, 
+                                               float omega,
-                                               int width, 
+                                               int width,
-                                               int height, 
+                                               int height,
                                               int stride)
 {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -604,7 +604,7 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
    if((i+j)%2 == isBlack)
    {
        // update du
-        float numerator_u = (s_left*(u_left + du_left) + s_up*(u_up + du_up) + s_right*(u_right + du_right) + s_down*(u_down + du_down) - 
+        float numerator_u = (s_left*(u_left + du_left) + s_up*(u_up + du_up) + s_right*(u_right + du_right) + s_down*(u_down + du_down) -
                             u * (s_left + s_right + s_up + s_down) - g_numerator_u[pos] - numerator_dudv*dv);
        du = (1.0f - omega) * du + omega * g_inv_denominator_u[pos] * numerator_u;
@@ -644,7 +644,7 @@ void InitTextures()
    initTexture2D(tex_I1);
    initTexture2D(tex_fine);      // for downsampling
    initTexture2D(tex_coarse);    // for prolongation
-    
+
    initTexture2D(tex_Ix);
    initTexture2D(tex_Ixx);
    initTexture2D(tex_Ix0);
@@ -725,7 +725,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
    const Ncv32u kSourceHeight = frame0.height();
    ncvAssertPrintReturn(frame1.width() == kSourceWidth && frame1.height() == kSourceHeight, "Frame dims do not match", NCV_INCONSISTENT_INPUT);
-    ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth && 
+    ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth &&
        uOut.height() == kSourceHeight && vOut.height() == kSourceHeight, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(gpu_mem_allocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);
@@ -780,7 +780,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
    SAFE_VECTOR_DECL(dv_new, gpu_mem_allocator, kSizeInPixelsAligned);
    // temporary storage
-    SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator, 
+    SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator,
        alignUp(kSourceWidth, kStrideAlignmentFloat) * alignUp(kSourceHeight, kStrideAlignmentFloat));
    // image derivatives
@@ -800,7 +800,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
    {
        const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f};
-        ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize, 
+        ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize,
            cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
        InitTextures();
@@ -827,10 +827,10 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
        size_t src_width_in_bytes = kSourceWidth * sizeof(float);
        size_t src_pitch_in_bytes = frame0.pitch();
-        ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI0->ptr(), dst_width_in_bytes, frame0.ptr(), 
+        ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI0->ptr(), dst_width_in_bytes, frame0.ptr(),
            src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
-        ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI1->ptr(), dst_width_in_bytes, frame1.ptr(), 
+        ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI1->ptr(), dst_width_in_bytes, frame1.ptr(),
            src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
    }
@@ -876,11 +876,11 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
            NcvRect32u dstROI (0, 0, level_width, level_height);
            // frame 0
-            ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI, 
+            ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI,
                level_frame0->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) );
            // frame 1
-            ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI, 
+            ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI,
                level_frame1->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) );
        }
@@ -956,14 +956,14 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
            dim3 dThreads(32, 6);
            const int kPitchTex = kLevelStride * sizeof(float);
-        
+
            NcvSize32u srcSize(kLevelWidth, kLevelHeight);
            Ncv32u nSrcStep = kLevelStride * sizeof(float);
            NcvRect32u oROI(0, 0, kLevelWidth, kLevelHeight);
            // Ix0
            ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Ix0.ptr(), srcSize, nSrcStep, oROI,
-                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) ); 
+                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
            // Iy0
            ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Iy0.ptr(), srcSize, nSrcStep, oROI,
@@ -987,8 +987,8 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
            // Ixy
            ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
-                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) ); 
+                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
-      
+
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix,  Ix.ptr(),  ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
@@ -1017,21 +1017,21 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
            {
                //compute coefficients
                prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>>
-                    (diffusivity_x.ptr(), 
+                    (diffusivity_x.ptr(),
-                     diffusivity_y.ptr(), 
+                     diffusivity_y.ptr(),
-                     denom_u.ptr(), 
+                     denom_u.ptr(),
-                     denom_v.ptr(), 
+                     denom_v.ptr(),
-                     num_dudv.ptr(), 
+                     num_dudv.ptr(),
-                     num_u.ptr(), 
+                     num_u.ptr(),
-                     num_v.ptr(), 
+                     num_v.ptr(),
-                     kLevelWidth, 
+                     kLevelWidth,
-                     kLevelHeight, 
+                     kLevelHeight,
-                     kLevelStride, 
+                     kLevelStride,
-                     alpha, 
+                     alpha,
                     gamma);
                ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
-                    
+
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
@@ -1043,7 +1043,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
                prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
                ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
-                
+
                //    linear system coefficients
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
@@ -1055,26 +1055,26 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
-            
+
                //solve linear system
                for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
                {
                    float omega = 1.99f;
-                
+
                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                    sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>
-                        (du_new.ptr(), 
+                        (du_new.ptr(),
-                        dv_new.ptr(), 
+                        dv_new.ptr(),
-                        denom_u.ptr(), 
+                        denom_u.ptr(),
                        denom_v.ptr(),
-                        num_u.ptr(), 
+                        num_u.ptr(),
-                        num_v.ptr(), 
+                        num_v.ptr(),
-                        num_dudv.ptr(), 
+                        num_dudv.ptr(),
-                        omega, 
+                        omega,
-                        kLevelWidth, 
+                        kLevelWidth,
-                        kLevelHeight, 
+                        kLevelHeight,
                        kLevelStride);
                    ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
@@ -1083,16 +1083,16 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                    sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>
-                        (du.ptr(), 
+                        (du.ptr(),
-                        dv.ptr(), 
+                        dv.ptr(),
-                        denom_u.ptr(), 
+                        denom_u.ptr(),
                        denom_v.ptr(),
-                        num_u.ptr(), 
+                        num_u.ptr(),
-                        num_v.ptr(), 
+                        num_v.ptr(),
-                        num_dudv.ptr(), 
+                        num_dudv.ptr(),
-                        omega, 
+                        omega,
-                        kLevelWidth, 
+                        kLevelWidth,
-                        kLevelHeight, 
+                        kLevelHeight,
                        kLevelStride);
                    ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
@@ -1120,19 +1120,19 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
                dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
                dim3 p_threads(32, 8);
-            
+
-                NcvSize32u srcSize (kLevelWidth, kLevelHeight);
+                NcvSize32u inner_srcSize (kLevelWidth, kLevelHeight);
                NcvSize32u dstSize (nw, nh);
                NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight);
                NcvRect32u dstROI (0, 0, nw, nh);
-                ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), srcSize, kLevelStride * sizeof (float), srcROI, 
+                ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,
                    ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );
                ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
                ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
-                ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), srcSize, kLevelStride * sizeof (float), srcROI, 
+                ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,
                    ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );
                ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
@@ -1148,11 +1148,11 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn( cudaMemcpy2DAsync
-            (uOut.ptr(), uOut.pitch(), ptrU->ptr(), 
+            (uOut.ptr(), uOut.pitch(), ptrU->ptr(),
            kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
        ncvAssertCUDAReturn( cudaMemcpy2DAsync
-            (vOut.ptr(), vOut.pitch(), ptrV->ptr(), 
+            (vOut.ptr(), vOut.pitch(), ptrV->ptr(),
            kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
--- a/modules/gpu/src/nvidia/core/NCV.cu
+++ b/modules/gpu/src/nvidia/core/NCV.cu
@@ -252,7 +252,7 @@ NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
 //===================================================================
-NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
+NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment_)
    :
    currentSize(0),
    _maxSize(0),
@@ -260,23 +260,23 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
    begin(NULL),
    end(NULL),
    _memType(NCVMemoryTypeNone),
-    _alignment(alignment),
+    _alignment(alignment_),
    bReusesMemory(false)
 {
-    NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
+    NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;
    ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
 }
-NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr)
+NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment_, void *reusePtr)
    :
    currentSize(0),
    _maxSize(0),
    allocBegin(NULL),
    _memType(memT),
-    _alignment(alignment)
+    _alignment(alignment_)
 {
-    NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
+    NcvBool bProperAlignment = (alignment_ & (alignment_ - 1)) == 0;
    ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: _alignment not power of 2");
    ncvAssertPrintCheck(memT != NCVMemoryTypeNone, "NCVMemStackAllocator ctor:: Incorrect allocator type");
@@ -425,12 +425,12 @@ size_t NCVMemStackAllocator::maxSize(void) const
 //===================================================================
-NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment)
+NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment_)
    :
    currentSize(0),
    _maxSize(0),
    _memType(memT),
-    _alignment(alignment)
+    _alignment(alignment_)
 {
    ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", );
 }
--- a/modules/gpu/src/opencv2/gpu/device/common.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/common.hpp
@@ -64,7 +64,7 @@
    #define cudaSafeCall(expr)  ___cudaSafeCall(expr, __FILE__, __LINE__)
 #endif
-namespace cv { namespace gpu 
+namespace cv { namespace gpu
 {
    void error(const char *error_string, const char *file, const int line, const char *func);
@@ -87,14 +87,14 @@ static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int
 #ifdef __CUDACC__
-namespace cv { namespace gpu 
+namespace cv { namespace gpu
-{     
+{
-    __host__ __device__ __forceinline__ int divUp(int total, int grain) 
+    __host__ __device__ __forceinline__ int divUp(int total, int grain)
-    { 
+    {
-        return (total + grain - 1) / grain; 
+        return (total + grain - 1) / grain;
    }
-    namespace device 
+    namespace device
    {
        typedef unsigned char uchar;
        typedef unsigned short ushort;
--- a/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp
@@ -45,7 +45,7 @@
 #include "common.hpp"
-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
@@ -54,13 +54,13 @@ namespace cv { namespace gpu { namespace device
        {
            __device__ __forceinline__ static void Load(const T* ptr, int offset, T& val)  { val = ptr[offset];  }
        };
    #else // __CUDA_ARCH__ >= 200        
-        #if defined(_WIN64) || defined(__LP64__)		
+    #else // __CUDA_ARCH__ >= 200
        #if defined(_WIN64) || defined(__LP64__)
            // 64-bit register modifier for inlined asm
            #define OPENCV_GPU_ASM_PTR "l"
-        #else	
+        #else
            // 32-bit register modifier for inlined asm
            #define OPENCV_GPU_ASM_PTR "r"
        #endif
@@ -84,21 +84,21 @@ namespace cv { namespace gpu { namespace device
                    asm("ld.global."#ptx_type" %0, [%1];" : "=r"(*reinterpret_cast<uint*>(&val)) : OPENCV_GPU_ASM_PTR(ptr + offset)); \
                } \
            };
-        
+
            OPENCV_GPU_DEFINE_FORCE_GLOB_B(uchar,  u8)
            OPENCV_GPU_DEFINE_FORCE_GLOB_B(schar,  s8)
            OPENCV_GPU_DEFINE_FORCE_GLOB_B(char,   b8)
            OPENCV_GPU_DEFINE_FORCE_GLOB  (ushort, u16, h)
            OPENCV_GPU_DEFINE_FORCE_GLOB  (short,  s16, h)
            OPENCV_GPU_DEFINE_FORCE_GLOB  (uint,   u32, r)
-            OPENCV_GPU_DEFINE_FORCE_GLOB  (int,    s32, r)	
+            OPENCV_GPU_DEFINE_FORCE_GLOB  (int,    s32, r)
-            OPENCV_GPU_DEFINE_FORCE_GLOB  (float,  f32, f)	
+            OPENCV_GPU_DEFINE_FORCE_GLOB  (float,  f32, f)
-            OPENCV_GPU_DEFINE_FORCE_GLOB  (double, f64, d)	            
+            OPENCV_GPU_DEFINE_FORCE_GLOB  (double, f64, d)
        #undef OPENCV_GPU_DEFINE_FORCE_GLOB
        #undef OPENCV_GPU_DEFINE_FORCE_GLOB_B
        #undef OPENCV_GPU_ASM_PTR
-        
+
    #endif // __CUDA_ARCH__ >= 200
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp
@@ -44,7 +44,7 @@
 #define __OPENCV_GPU_DYNAMIC_SMEM_HPP__
 namespace cv { namespace gpu { namespace device
-{   
+{
    template<class T> struct DynamicSharedMem
    {
        __device__ __forceinline__ operator T*()
--- a/modules/gpu/src/opencv2/gpu/device/emulation.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/emulation.hpp
@@ -45,21 +45,21 @@
 #include "warp_reduce.hpp"
-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    struct Emulation
    {
-	    static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
+        static __forceinline__ __device__ int Ballot(int predicate, volatile int* cta_buffer)
-	    {
+        {
    #if __CUDA_ARCH__ >= 200
-		    (void)cta_buffer;
+            (void)cta_buffer;
-		    return __ballot(predicate);
+            return __ballot(predicate);
    #else
-		    int tid = threadIdx.x;				
+            int tid = threadIdx.x;
-		    cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
+            cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0;
-		    return warp_reduce(cta_buffer);
+            return warp_reduce(cta_buffer);
    #endif
-	    }
+        }
    };
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/opencv2/gpu/device/funcattrib.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/funcattrib.hpp
@@ -46,14 +46,14 @@
 #include <cstdio>
-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    template<class Func> 
+    template<class Func>
    void printFuncAttrib(Func& func)
    {
        cudaFuncAttributes attrs;
-        cudaFuncGetAttributes(&attrs, func);  
+        cudaFuncGetAttributes(&attrs, func);
        printf("=== Function stats ===\n");
        printf("Name: \n");
@@ -65,7 +65,7 @@ namespace cv { namespace gpu { namespace device
        printf("ptxVersion         = %d\n", attrs.ptxVersion);
        printf("binaryVersion      = %d\n", attrs.binaryVersion);
        printf("\n");
-        fflush(stdout); 
+        fflush(stdout);
    }
 }}} // namespace cv { namespace gpu { namespace device
--- a/modules/gpu/src/opencv2/gpu/device/functional.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/functional.hpp
@@ -48,7 +48,7 @@
 #include "vec_traits.hpp"
 #include "type_traits.hpp"
-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    // Function Objects
@@ -257,7 +257,7 @@ namespace cv { namespace gpu { namespace device
    template <typename T> struct bit_not : unary_function<T, T>
    {
-        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const 
+        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType v) const
        {
            return ~v;
        }
@@ -268,7 +268,7 @@ namespace cv { namespace gpu { namespace device
    // Generalized Identity Operations
    template <typename T> struct identity : unary_function<T, T>
    {
-        __device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const 
+        __device__ __forceinline__ typename TypeTraits<T>::ParameterType operator()(typename TypeTraits<T>::ParameterType x) const
        {
            return x;
        }
@@ -278,7 +278,7 @@ namespace cv { namespace gpu { namespace device
    template <typename T1, typename T2> struct project1st : binary_function<T1, T2, T1>
    {
-        __device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const 
+        __device__ __forceinline__ typename TypeTraits<T1>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
        {
            return lhs;
        }
@@ -288,7 +288,7 @@ namespace cv { namespace gpu { namespace device
    template <typename T1, typename T2> struct project2nd : binary_function<T1, T2, T2>
    {
-        __device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const 
+        __device__ __forceinline__ typename TypeTraits<T2>::ParameterType operator()(typename TypeTraits<T1>::ParameterType lhs, typename TypeTraits<T2>::ParameterType rhs) const
        {
            return rhs;
        }
@@ -308,7 +308,7 @@ namespace cv { namespace gpu { namespace device
    template <typename T> struct maximum : binary_function<T, T, T>
    {
-        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const 
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
        {
            return lhs < rhs ? rhs : lhs;
        }
@@ -328,7 +328,7 @@ namespace cv { namespace gpu { namespace device
    template <typename T> struct minimum : binary_function<T, T, T>
    {
-        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const 
+        __device__ __forceinline__ T operator()(typename TypeTraits<T>::ParameterType lhs, typename TypeTraits<T>::ParameterType rhs) const
        {
            return lhs < rhs ? lhs : rhs;
        }
@@ -410,12 +410,14 @@ namespace cv { namespace gpu { namespace device
    #undef OPENCV_GPU_IMPLEMENT_UN_FUNCTOR
    #undef OPENCV_GPU_IMPLEMENT_BIN_FUNCTOR
-    template<typename T> struct hypot_sqr_func : binary_function<T, T, float> 
+    template<typename T> struct hypot_sqr_func : binary_function<T, T, float>
    {
        __device__ __forceinline__ T operator ()(typename TypeTraits<T>::ParameterType src1, typename TypeTraits<T>::ParameterType src2) const
        {
            return src1 * src1 + src2 * src2;
        }
        __device__ __forceinline__ hypot_sqr_func(const hypot_sqr_func& other) : binary_function<T, T, float>(){}
        __device__ __forceinline__ hypot_sqr_func() : binary_function<T, T, float>(){}
    };
    // Saturate Cast Functor
@@ -438,6 +440,7 @@ namespace cv { namespace gpu { namespace device
        {
            return (src > thresh) * maxVal;
        }
        __device__ __forceinline__ thresh_binary_func(const thresh_binary_func& other)
            : unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
@@ -455,6 +458,7 @@ namespace cv { namespace gpu { namespace device
        {
            return (src <= thresh) * maxVal;
        }
        __device__ __forceinline__ thresh_binary_inv_func(const thresh_binary_inv_func& other)
            : unary_function<T, T>(), thresh(other.thresh), maxVal(other.maxVal){}
@@ -519,12 +523,16 @@ namespace cv { namespace gpu { namespace device
      explicit __host__ __device__ __forceinline__ unary_negate(const Predicate& p) : pred(p) {}
      __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::argument_type>::ParameterType x) const
-      { 
+      {
-          return !pred(x); 
+          return !pred(x);
      }
        __device__ __forceinline__ unary_negate(const unary_negate& other) : unary_function<typename Predicate::argument_type, bool>(){}
        __device__ __forceinline__ unary_negate() : unary_function<typename Predicate::argument_type, bool>(){}
      const Predicate pred;
    };
    template <typename Predicate> __host__ __device__ __forceinline__ unary_negate<Predicate> not1(const Predicate& pred)
    {
        return unary_negate<Predicate>(pred);
@@ -534,19 +542,26 @@ namespace cv { namespace gpu { namespace device
    {
        explicit __host__ __device__ __forceinline__ binary_negate(const Predicate& p) : pred(p) {}
-        __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x, typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
+        __device__ __forceinline__ bool operator()(typename TypeTraits<typename Predicate::first_argument_type>::ParameterType x,
-        { 
+                                                   typename TypeTraits<typename Predicate::second_argument_type>::ParameterType y) const
-            return !pred(x,y); 
+        {
            return !pred(x,y);
        }
        __device__ __forceinline__ binary_negate(const binary_negate& other)
        : binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
        __device__ __forceinline__ binary_negate() :
        binary_function<typename Predicate::first_argument_type, typename Predicate::second_argument_type, bool>(){}
        const Predicate pred;
    };
    template <typename BinaryPredicate> __host__ __device__ __forceinline__ binary_negate<BinaryPredicate> not2(const BinaryPredicate& pred)
    {
        return binary_negate<BinaryPredicate>(pred);
    }
-    template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type> 
+    template <typename Op> struct binder1st : unary_function<typename Op::second_argument_type, typename Op::result_type>
    {
        __host__ __device__ __forceinline__ binder1st(const Op& op_, const typename Op::first_argument_type& arg1_) : op(op_), arg1(arg1_) {}
@@ -555,15 +570,19 @@ namespace cv { namespace gpu { namespace device
            return op(arg1, a);
        }
        __device__ __forceinline__ binder1st(const binder1st& other) :
        unary_function<typename Op::second_argument_type, typename Op::result_type>(){}
        const Op op;
        const typename Op::first_argument_type arg1;
    };
    template <typename Op, typename T> __host__ __device__ __forceinline__ binder1st<Op> bind1st(const Op& op, const T& x)
    {
        return binder1st<Op>(op, typename Op::first_argument_type(x));
    }
-    template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type> 
+    template <typename Op> struct binder2nd : unary_function<typename Op::first_argument_type, typename Op::result_type>
    {
        __host__ __device__ __forceinline__ binder2nd(const Op& op_, const typename Op::second_argument_type& arg2_) : op(op_), arg2(arg2_) {}
@@ -572,16 +591,19 @@ namespace cv { namespace gpu { namespace device
            return op(a, arg2);
        }
         __device__ __forceinline__ binder2nd(const binder2nd& other) :
        unary_function<typename Op::first_argument_type, typename Op::result_type>(), op(other.op), arg2(other.arg2){}
        const Op op;
        const typename Op::second_argument_type arg2;
    };
    template <typename Op, typename T> __host__ __device__ __forceinline__ binder2nd<Op> bind2nd(const Op& op, const T& x)
    {
        return binder2nd<Op>(op, typename Op::second_argument_type(x));
    }
    // Functor Traits
    template <typename F> struct IsUnaryFunction
    {
        typedef char Yes;
@@ -618,7 +640,7 @@ namespace cv { namespace gpu { namespace device
        {
            enum { shift = UnOpShift<sizeof(T), sizeof(D)>::shift };
        };
-        
+
        template <size_t src_elem_size1, size_t src_elem_size2, size_t dst_elem_size> struct BinOpShift { enum { shift = 1 }; };
        template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 1> { enum { shift = 4 }; };
        template <size_t src_elem_size1, size_t src_elem_size2> struct BinOpShift<src_elem_size1, src_elem_size2, 2> { enum { shift = 2 }; };
--- a/modules/gpu/src/opencv2/gpu/device/limits.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/limits.hpp
@@ -46,7 +46,7 @@
 #include <limits>
 #include "common.hpp"
-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    template<class T> struct numeric_limits
    {
--- a/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp
@@ -57,35 +57,35 @@ namespace cv { namespace gpu { namespace device
    template<typename _Tp> __device__ __forceinline__ _Tp saturate_cast(double v) { return _Tp(v); }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(schar v)
-    { 
+    {
-        return (uchar) ::max((int)v, 0); 
+        return (uchar) ::max((int)v, 0);
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(ushort v)
-    { 
+    {
-        return (uchar) ::min((uint)v, (uint)UCHAR_MAX); 
+        return (uchar) ::min((uint)v, (uint)UCHAR_MAX);
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(int v)
-    { 
+    {
-        return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); 
+        return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0);
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(uint v)
-    { 
+    {
-        return (uchar) ::min(v, (uint)UCHAR_MAX); 
+        return (uchar) ::min(v, (uint)UCHAR_MAX);
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(short v)
-    { 
+    {
-        return saturate_cast<uchar>((uint)v); 
+        return saturate_cast<uchar>((uint)v);
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(float v)
-    { 
+    {
-        int iv = __float2int_rn(v); 
+        int iv = __float2int_rn(v);
-        return saturate_cast<uchar>(iv); 
+        return saturate_cast<uchar>(iv);
    }
    template<> __device__ __forceinline__ uchar saturate_cast<uchar>(double v)
    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
-        int iv = __double2int_rn(v); 
+        int iv = __double2int_rn(v);
        return saturate_cast<uchar>(iv);
    #else
        return saturate_cast<uchar>((float)v);
@@ -93,35 +93,35 @@ namespace cv { namespace gpu { namespace device
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(uchar v)
-    { 
+    {
-        return (schar) ::min((int)v, SCHAR_MAX); 
+        return (schar) ::min((int)v, SCHAR_MAX);
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(ushort v)
-    { 
+    {
-        return (schar) ::min((uint)v, (uint)SCHAR_MAX); 
+        return (schar) ::min((uint)v, (uint)SCHAR_MAX);
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(int v)
    {
        return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ? v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(short v)
-    { 
+    {
-        return saturate_cast<schar>((int)v); 
+        return saturate_cast<schar>((int)v);
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(uint v)
-    { 
+    {
-        return (schar) ::min(v, (uint)SCHAR_MAX); 
+        return (schar) ::min(v, (uint)SCHAR_MAX);
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(float v)
-    { 
+    {
-        int iv = __float2int_rn(v); 
+        int iv = __float2int_rn(v);
-        return saturate_cast<schar>(iv); 
+        return saturate_cast<schar>(iv);
    }
    template<> __device__ __forceinline__ schar saturate_cast<schar>(double v)
-    {             
+    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
-        int iv = __double2int_rn(v); 
+        int iv = __double2int_rn(v);
        return saturate_cast<schar>(iv);
    #else
        return saturate_cast<schar>((float)v);
@@ -129,30 +129,30 @@ namespace cv { namespace gpu { namespace device
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(schar v)
-    { 
+    {
-        return (ushort) ::max((int)v, 0); 
+        return (ushort) ::max((int)v, 0);
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(short v)
-    { 
+    {
-        return (ushort) ::max((int)v, 0); 
+        return (ushort) ::max((int)v, 0);
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(int v)
-    { 
+    {
-        return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); 
+        return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0);
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(uint v)
-    { 
+    {
-        return (ushort) ::min(v, (uint)USHRT_MAX); 
+        return (ushort) ::min(v, (uint)USHRT_MAX);
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(float v)
    {
-        int iv = __float2int_rn(v); 
+        int iv = __float2int_rn(v);
-        return saturate_cast<ushort>(iv); 
+        return saturate_cast<ushort>(iv);
    }
    template<> __device__ __forceinline__ ushort saturate_cast<ushort>(double v)
-    {             
+    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
-        int iv = __double2int_rn(v); 
+        int iv = __double2int_rn(v);
        return saturate_cast<ushort>(iv);
    #else
        return saturate_cast<ushort>((float)v);
@@ -160,37 +160,37 @@ namespace cv { namespace gpu { namespace device
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(ushort v)
-    { 
+    {
-        return (short) ::min((int)v, SHRT_MAX); 
+        return (short) ::min((int)v, SHRT_MAX);
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(int v)
    {
        return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ? v : v > 0 ? SHRT_MAX : SHRT_MIN);
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(uint v)
-    { 
+    {
-        return (short) ::min(v, (uint)SHRT_MAX); 
+        return (short) ::min(v, (uint)SHRT_MAX);
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(float v)
-    { 
+    {
-        int iv = __float2int_rn(v); 
+        int iv = __float2int_rn(v);
-        return saturate_cast<short>(iv); 
+        return saturate_cast<short>(iv);
    }
    template<> __device__ __forceinline__ short saturate_cast<short>(double v)
-    {            
+    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
-        int iv = __double2int_rn(v); 
+        int iv = __double2int_rn(v);
        return saturate_cast<short>(iv);
    #else
        return saturate_cast<short>((float)v);
    #endif
    }
-    template<> __device__ __forceinline__ int saturate_cast<int>(float v) 
+    template<> __device__ __forceinline__ int saturate_cast<int>(float v)
-    { 
+    {
-        return __float2int_rn(v); 
+        return __float2int_rn(v);
    }
-    template<> __device__ __forceinline__ int saturate_cast<int>(double v) 
+    template<> __device__ __forceinline__ int saturate_cast<int>(double v)
    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
        return __double2int_rn(v);
@@ -200,11 +200,11 @@ namespace cv { namespace gpu { namespace device
    }
    template<> __device__ __forceinline__ uint saturate_cast<uint>(float v)
-    { 
+    {
-        return __float2uint_rn(v); 
+        return __float2uint_rn(v);
    }
-    template<> __device__ __forceinline__ uint saturate_cast<uint>(double v) 
+    template<> __device__ __forceinline__ uint saturate_cast<uint>(double v)
-    {            
+    {
    #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 130
        return __double2uint_rn(v);
    #else
--- a/modules/gpu/src/opencv2/gpu/device/scan.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/scan.hpp
@@ -0,0 +1,166 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors "as is" and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #ifndef __OPENCV_GPU_SCAN_HPP__
 #define __OPENCV_GPU_SCAN_HPP__
        enum ScanKind { EXCLUSIVE = 0,  INCLUSIVE = 1 };
        template <ScanKind Kind, typename T, typename F> struct WarpScan
        {
            __device__ __forceinline__ WarpScan() {}
            __device__ __forceinline__ WarpScan(const WarpScan& other) { (void)other; }
            __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
            {
                const unsigned int lane = idx & 31;
                F op;
                if ( lane >=  1) ptr [idx ] = op(ptr [idx -  1], ptr [idx]);
                if ( lane >=  2) ptr [idx ] = op(ptr [idx -  2], ptr [idx]);
                if ( lane >=  4) ptr [idx ] = op(ptr [idx -  4], ptr [idx]);
                if ( lane >=  8) ptr [idx ] = op(ptr [idx -  8], ptr [idx]);
                if ( lane >= 16) ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
                if( Kind == INCLUSIVE )
                    return ptr [idx];
                else
                    return (lane > 0) ? ptr [idx - 1] : 0;
            }
            __device__ __forceinline__ unsigned int index(const unsigned int tid)
            {
                return tid;
            }
            __device__ __forceinline__ void init(volatile T *ptr){}
            static const int warp_offset      = 0;
            typedef WarpScan<INCLUSIVE, T, F>  merge;
        };
        template <ScanKind Kind , typename T, typename F> struct WarpScanNoComp
        {
            __device__ __forceinline__ WarpScanNoComp() {}
            __device__ __forceinline__ WarpScanNoComp(const WarpScanNoComp& other) { (void)other; }
            __device__ __forceinline__ T operator()( volatile T *ptr , const unsigned int idx)
            {
                const unsigned int lane = threadIdx.x & 31;
                F op;
                ptr [idx ] = op(ptr [idx -  1], ptr [idx]);
                ptr [idx ] = op(ptr [idx -  2], ptr [idx]);
                ptr [idx ] = op(ptr [idx -  4], ptr [idx]);
                ptr [idx ] = op(ptr [idx -  8], ptr [idx]);
                ptr [idx ] = op(ptr [idx - 16], ptr [idx]);
                if( Kind == INCLUSIVE )
                    return ptr [idx];
                else
                    return (lane > 0) ? ptr [idx - 1] : 0;
            }
            __device__ __forceinline__ unsigned int index(const unsigned int tid)
            {
                return (tid >> warp_log) * warp_smem_stride + 16 + (tid & warp_mask);
            }
            __device__ __forceinline__ void init(volatile T *ptr)
            {
                ptr[threadIdx.x] = 0;
            }
            static const int warp_smem_stride = 32 + 16 + 1;
            static const int warp_offset      = 16;
            static const int warp_log         = 5;
            static const int warp_mask        = 31;
            typedef WarpScanNoComp<INCLUSIVE, T, F> merge;
        };
        template <ScanKind Kind , typename T, typename Sc, typename F> struct BlockScan
        {
            __device__ __forceinline__ BlockScan() {}
            __device__ __forceinline__ BlockScan(const BlockScan& other) { (void)other; }
            __device__ __forceinline__ T operator()(volatile T *ptr)
            {
                const unsigned int tid  = threadIdx.x;
                const unsigned int lane = tid & warp_mask;
                const unsigned int warp = tid >> warp_log;
                Sc scan;
                typename Sc::merge merge_scan;
                const unsigned int idx = scan.index(tid);
                T val = scan(ptr, idx);
                __syncthreads ();
                if( warp == 0)
                    scan.init(ptr);
                __syncthreads ();
                if( lane == 31 )
                    ptr [scan.warp_offset + warp ] = (Kind == INCLUSIVE) ? val : ptr [idx];
                __syncthreads ();
                if( warp == 0 )
                    merge_scan(ptr, idx);
                __syncthreads();
                if ( warp > 0)
                    val = ptr [scan.warp_offset + warp - 1] + val;
                __syncthreads ();
                ptr[idx] = val;
                __syncthreads ();
                return val ;
            }
            static const int warp_log  = 5;
            static const int warp_mask = 31;
        };
 #endif
--- a/modules/gpu/src/opencv2/gpu/device/static_check.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/static_check.hpp
@@ -43,27 +43,27 @@
 #ifndef __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__
 #define __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__
-#if defined(__CUDACC__) 
+#if defined(__CUDACC__)
-    #define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__ 
+    #define __OPENCV_GPU_HOST_DEVICE__ __host__ __device__ __forceinline__
 #else
    #define __OPENCV_GPU_HOST_DEVICE__
-#endif  
+#endif
-namespace cv { namespace gpu 
+namespace cv { namespace gpu
-{ 
+{
    namespace device
    {
        template<bool expr> struct Static {};
-        
+
-        template<> struct Static<true> 
+        template<> struct Static<true>
-        { 
+        {
-            __OPENCV_GPU_HOST_DEVICE__ static void check() {}; 
+            __OPENCV_GPU_HOST_DEVICE__ static void check() {};
        };
-    }    
+    }
    using ::cv::gpu::device::Static;
 }}
 #undef __OPENCV_GPU_HOST_DEVICE__
-#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */ 
+#endif /* __OPENCV_GPU_GPU_DEVICE_STATIC_CHECK_HPP__ */
--- a/modules/gpu/src/opencv2/gpu/device/transform.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp
@@ -47,7 +47,7 @@
 #include "utility.hpp"
 #include "detail/transform_detail.hpp"
-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    template <typename T, typename D, typename UnOp, typename Mask>
    static inline void transform(DevMem2D_<T> src, DevMem2D_<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)
--- a/modules/gpu/src/opencv2/gpu/device/type_traits.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/type_traits.hpp
@@ -45,11 +45,11 @@
 #include "detail/type_traits_detail.hpp"
-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    template <typename T> struct IsSimpleParameter
    {
-        enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value || 
+        enum {value = type_traits_detail::IsIntegral<T>::value || type_traits_detail::IsFloat<T>::value ||
            type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<T>::type>::value};
    };
@@ -65,16 +65,16 @@ namespace cv { namespace gpu { namespace device
        enum { isVolatile       = type_traits_detail::UnVolatile<T>::value };
        enum { isReference      = type_traits_detail::ReferenceTraits<UnqualifiedType>::value };
-        enum { isPointer        = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value };        
+        enum { isPointer        = type_traits_detail::PointerTraits<typename type_traits_detail::ReferenceTraits<UnqualifiedType>::type>::value };
-        enum { isUnsignedInt = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
+        enum { isUnsignedInt    = type_traits_detail::IsUnsignedIntegral<UnqualifiedType>::value };
-        enum { isSignedInt   = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };
+        enum { isSignedInt      = type_traits_detail::IsSignedIntergral<UnqualifiedType>::value };
-        enum { isIntegral    = type_traits_detail::IsIntegral<UnqualifiedType>::value };
+        enum { isIntegral       = type_traits_detail::IsIntegral<UnqualifiedType>::value };
-        enum { isFloat       = type_traits_detail::IsFloat<UnqualifiedType>::value  };
+        enum { isFloat          = type_traits_detail::IsFloat<UnqualifiedType>::value };
-        enum { isArith       = isIntegral || isFloat };
+        enum { isArith          = isIntegral || isFloat };
-        enum { isVec         = type_traits_detail::IsVec<UnqualifiedType>::value  };
+        enum { isVec            = type_traits_detail::IsVec<UnqualifiedType>::value };
-        
+
-        typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value, 
+        typedef typename type_traits_detail::Select<IsSimpleParameter<UnqualifiedType>::value,
            T, typename type_traits_detail::AddParameterType<T>::type>::type ParameterType;
    };
 }}}
--- a/modules/gpu/src/opencv2/gpu/device/utility.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/utility.hpp
@@ -47,17 +47,17 @@
 #include "datamov_utils.hpp"
 #include "detail/utility_detail.hpp"
-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    #define OPENCV_GPU_LOG_WARP_SIZE	    (5)
+    #define OPENCV_GPU_LOG_WARP_SIZE        (5)
-    #define OPENCV_GPU_WARP_SIZE	        (1 << OPENCV_GPU_LOG_WARP_SIZE)
+    #define OPENCV_GPU_WARP_SIZE            (1 << OPENCV_GPU_LOG_WARP_SIZE)
    #define OPENCV_GPU_LOG_MEM_BANKS        ((__CUDA_ARCH__ >= 200) ? 5 : 4) // 32 banks on fermi, 16 on tesla
    #define OPENCV_GPU_MEM_BANKS            (1 << OPENCV_GPU_LOG_MEM_BANKS)
    ///////////////////////////////////////////////////////////////////////////////
    // swap
-    template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b) 
+    template <typename T> void __device__ __host__ __forceinline__ swap(T& a, T& b)
    {
        const T temp = a;
        a = b;
@@ -71,9 +71,9 @@ namespace cv { namespace gpu { namespace device
    {
        explicit __host__ __device__ __forceinline__ SingleMask(PtrStepb mask_) : mask(mask_) {}
        __host__ __device__ __forceinline__ SingleMask(const SingleMask& mask_): mask(mask_.mask){}
-        
+
        __device__ __forceinline__ bool operator()(int y, int x) const
-        {            
+        {
            return mask.ptr(y)[x] != 0;
        }
@@ -82,13 +82,13 @@ namespace cv { namespace gpu { namespace device
    struct SingleMaskChannels
    {
-        __host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_) 
+        __host__ __device__ __forceinline__ SingleMaskChannels(PtrStepb mask_, int channels_)
        : mask(mask_), channels(channels_) {}
        __host__ __device__ __forceinline__ SingleMaskChannels(const SingleMaskChannels& mask_)
            :mask(mask_.mask), channels(mask_.channels){}
-        
+
        __device__ __forceinline__ bool operator()(int y, int x) const
-        {            
+        {
            return mask.ptr(y)[x / channels] != 0;
        }
@@ -112,7 +112,7 @@ namespace cv { namespace gpu { namespace device
        {
            curMask = maskCollection[z];
        }
-        
+
        __device__ __forceinline__ bool operator()(int y, int x) const
        {
            uchar val;
@@ -165,20 +165,20 @@ namespace cv { namespace gpu { namespace device
        utility_detail::ReductionDispatcher<n <= 64>::reduce<n>(data, partial_reduction, tid, op);
    }
-    template <int n, typename T, typename V, typename Pred> 
+    template <int n, typename T, typename V, typename Pred>
    __device__ __forceinline__ void reducePredVal(volatile T* sdata, T& myData, V* sval, V& myVal, int tid, const Pred& pred)
    {
        StaticAssert<n >= 8 && n <= 512>::check();
        utility_detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
    }
-    template <int n, typename T, typename V1, typename V2, typename Pred> 
+    template <int n, typename T, typename V1, typename V2, typename Pred>
    __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
    {
        StaticAssert<n >= 8 && n <= 512>::check();
        utility_detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
    }
-    
+
    ///////////////////////////////////////////////////////////////////////////////
    // Solve linear system
@@ -212,17 +212,17 @@ namespace cv { namespace gpu { namespace device
        {
            double invdet = 1.0 / det;
-            x[0] = saturate_cast<T>(invdet * 
+            x[0] = saturate_cast<T>(invdet *
                (b[0]    * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) -
                 A[0][1] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) +
                 A[0][2] * (b[1]    * A[2][1] - A[1][1] * b[2]   )));
-            x[1] = saturate_cast<T>(invdet * 
+            x[1] = saturate_cast<T>(invdet *
                (A[0][0] * (b[1]    * A[2][2] - A[1][2] * b[2]   ) -
                 b[0]    * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) +
                 A[0][2] * (A[1][0] * b[2]    - b[1]    * A[2][0])));
-            x[2] = saturate_cast<T>(invdet * 
+            x[2] = saturate_cast<T>(invdet *
                (A[0][0] * (A[1][1] * b[2]    - b[1]    * A[2][1]) -
                 A[0][1] * (A[1][0] * b[2]    - b[1]    * A[2][0]) +
                 b[0]    * (A[1][0] * A[2][1] - A[1][1] * A[2][0])));
--- a/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_distance.hpp
@@ -47,7 +47,7 @@
 #include "functional.hpp"
 #include "detail/vec_distance_detail.hpp"
-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    template <typename T> struct L1Dist
    {
@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
    };
    // calc distance between two vectors in global memory
-    template <int THREAD_DIM, typename Dist, typename T1, typename T2> 
+    template <int THREAD_DIM, typename Dist, typename T1, typename T2>
    __device__ void calcVecDiffGlobal(const T1* vec1, const T2* vec2, int len, Dist& dist, typename Dist::result_type* smem, int tid)
    {
        for (int i = tid; i < len; i += THREAD_DIM)
@@ -170,9 +170,9 @@ namespace cv { namespace gpu { namespace device
    // calc distance between two vectors, first vector is cached in register or shared memory, second vector is in global memory
    template <int THREAD_DIM, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T1, typename T2>
    __device__ __forceinline__ void calcVecDiffCached(const T1* vecCached, const T2* vecGlob, int len, Dist& dist, typename Dist::result_type* smem, int tid)
-    {        
+    {
        vec_distance_detail::VecDiffCachedCalculator<THREAD_DIM, MAX_LEN, LEN_EQ_MAX_LEN>::calc(vecCached, vecGlob, len, dist, tid);
-        
+
        dist.reduceAll<THREAD_DIM>(smem, tid);
    }
--- a/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_math.hpp
@@ -47,7 +47,7 @@
 #include "vec_traits.hpp"
 #include "functional.hpp"
-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    namespace vec_math_detail
    {
@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace device
    }
    namespace vec_math_detail
-    {    
+    {
        template <typename T1, typename T2> struct BinOpTraits
        {
            typedef int argument_type;
@@ -326,5 +326,5 @@ namespace cv { namespace gpu { namespace device
    #undef OPENCV_GPU_IMPLEMENT_VEC_OP
    #undef OPENCV_GPU_IMPLEMENT_VEC_INT_OP
 }}} // namespace cv { namespace gpu { namespace device
-        
+
 #endif // __OPENCV_GPU_VECMATH_HPP__
--- a/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
@@ -45,7 +45,7 @@
 #include "common.hpp"
-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    template<typename T, int N> struct TypeVec;
@@ -219,18 +219,18 @@ namespace cv { namespace gpu { namespace device
    #undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS
-    template<> struct VecTraits<char> 
+    template<> struct VecTraits<char>
-    { 
+    {
        typedef char elem_type;
-        enum {cn=1}; 
+        enum {cn=1};
        static __device__ __host__ __forceinline__ char all(char v) {return v;}
        static __device__ __host__ __forceinline__ char make(char x) {return x;}
        static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}
    };
-    template<> struct VecTraits<schar> 
+    template<> struct VecTraits<schar>
-    { 
+    {
        typedef schar elem_type;
-        enum {cn=1}; 
+        enum {cn=1};
        static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
        static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
        static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}
--- a/modules/gpu/src/opencv2/gpu/device/warp.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/warp.hpp
@@ -43,7 +43,7 @@
 #ifndef __OPENCV_GPU_DEVICE_WARP_HPP__
 #define __OPENCV_GPU_DEVICE_WARP_HPP__
-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    struct Warp
    {
@@ -64,18 +64,18 @@ namespace cv { namespace gpu { namespace device
        template<typename It, typename T>
        static __device__ __forceinline__ void fill(It beg, It end, const T& value)
-        {                
+        {
            for(It t = beg + laneId(); t < end; t += STRIDE)
                *t = value;
-        }            
+        }
        template<typename InIt, typename OutIt>
        static __device__ __forceinline__ OutIt copy(InIt beg, InIt end, OutIt out)
-        {                
+        {
            for(InIt t = beg + laneId(); t < end; t += STRIDE, out += STRIDE)
                *out = *t;
            return out;
-        }            
+        }
        template<typename InIt, typename OutIt, class UnOp>
        static __device__ __forceinline__ OutIt transform(InIt beg, InIt end, OutIt out, UnOp op)
@@ -90,7 +90,7 @@ namespace cv { namespace gpu { namespace device
        {
            unsigned int lane = laneId();
-            InIt1 t1 = beg1 + lane; 
+            InIt1 t1 = beg1 + lane;
            InIt2 t2 = beg2 + lane;
            for(; t1 < end1; t1 += STRIDE, t2 += STRIDE, out += STRIDE)
                *out = op(*t1, *t2);
@@ -100,7 +100,7 @@ namespace cv { namespace gpu { namespace device
        template<typename OutIt, typename T>
        static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
        {
-            unsigned int lane = laneId();                
+            unsigned int lane = laneId();
            value += lane;
            for(OutIt t = beg + lane; t < end; t += STRIDE, value += STRIDE)
--- a/modules/gpu/src/resize.cpp
+++ b/modules/gpu/src/resize.cpp
@@ -44,7 +44,32 @@
 #ifndef HAVE_CUDA
-void cv::gpu::resize(const GpuMat&, GpuMat&, Size, double, double, int, Stream&) { throw_nogpu(); }
+void cv::gpu::resize(const GpuMat& src, GpuMat& dst, Size dsize, double fx, double fy, int interpolation, Stream& s)
 {
    (void)src;
    (void)dst;
    (void)dsize;
    (void)fx;
    (void)fy;
    (void)interpolation;
    (void)s;
    throw_nogpu();
 }
 void cv::gpu::resize(const GpuMat& src, GpuMat& dst,GpuMat& buffer, Size dsize,
            double fx, double fy, int interpolation, Stream& s)
 {
    (void)src;
    (void)dst;
    (void)dsize;
    (void)fx;
    (void)fy;
    (void)interpolation;
    (void)buffer;
    (void)s;
    throw_nogpu();
 }
 #else // HAVE_CUDA