fixed gpu::filter2D border interpolation for CV_32FC1 type

added additional tests for gpu filters; fixed gpu features2D tests
2012-03-21 14:38:23 +00:00
parent c1a6cb6221
commit 059cef57e6
16 changed files with 1730 additions and 1515 deletions
--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -46,16 +46,16 @@
 #include "opencv2/gpu/device/saturate_cast.hpp"
 #include "opencv2/gpu/device/border_interpolate.hpp"

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
-    namespace imgproc 
+    namespace imgproc
    {
        /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////

        texture<uchar4, 2> tex_meanshift;

-        __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out, 
-                                        size_t out_step, int cols, int rows, 
+        __device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
+                                        size_t out_step, int cols, int rows,
                                        int sp, int sr, int maxIter, float eps)
        {
            int isr2 = sr*sr;
@@ -78,7 +78,7 @@ namespace cv { namespace gpu { namespace device
                {
                    int rowCount = 0;
                    for( int x = minx; x <= maxx; x++ )
-                    {                    
+                    {
                        uchar4 t = tex2D( tex_meanshift, x, y );

                        int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);
@@ -128,16 +128,16 @@ namespace cv { namespace gpu { namespace device
                do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
        }

-        __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep, 
-                                             unsigned char* outsp, size_t outspstep, 
-                                             int cols, int rows, 
+        __global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep,
+                                             unsigned char* outsp, size_t outspstep,
+                                             int cols, int rows,
                                             int sp, int sr, int maxIter, float eps)
        {
            int x0 = blockIdx.x * blockDim.x + threadIdx.x;
            int y0 = blockIdx.y * blockDim.y + threadIdx.y;

            if( x0 < cols && y0 < rows )
-            {            
+            {
                int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
                *(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
            }
@@ -159,10 +159,10 @@ namespace cv { namespace gpu { namespace device
            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );

-            //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );        
+            //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
        }

-        void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream) 
+        void meanShiftProc_gpu(const DevMem2Db& src, DevMem2Db dstr, DevMem2Db dstsp, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
        {
            dim3 grid(1, 1, 1);
            dim3 threads(32, 8, 1);
@@ -178,14 +178,14 @@ namespace cv { namespace gpu { namespace device
            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );

-            //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );        
+            //cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
        }

        /////////////////////////////////// drawColorDisp ///////////////////////////////////////////////

        template <typename T>
        __device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)
-        {        
+        {
            unsigned int H = ((ndisp-d) * 240)/ndisp;

            unsigned int hi = (H/60) % 6;
@@ -195,7 +195,7 @@ namespace cv { namespace gpu { namespace device
            float t = V * (1 - (1 - f) * S);

            float3 res;
-            
+
            if (hi == 0) //R = V,	G = t,	B = p
            {
                res.x = p;
@@ -208,15 +208,15 @@ namespace cv { namespace gpu { namespace device
                res.x = p;
                res.y = V;
                res.z = q;
-            }        
-            
+            }
+
            if (hi == 2) // R = p,	G = V,	B = t
            {
                res.x = t;
                res.y = V;
                res.z = p;
            }
-                
+
            if (hi == 3) // R = p,	G = q,	B = V
            {
                res.x = V;
@@ -242,15 +242,15 @@ namespace cv { namespace gpu { namespace device
            const unsigned int r = (unsigned int)(::max(0.f, ::min(res.z, 1.f)) * 255.f);
            const unsigned int a = 255U;

-            return (a << 24) + (r << 16) + (g << 8) + b;    
-        } 
+            return (a << 24) + (r << 16) + (g << 8) + b;
+        }

        __global__ void drawColorDisp(uchar* disp, size_t disp_step, uchar* out_image, size_t out_step, int width, int height, int ndisp)
        {
            const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 2;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-            if(x < width && y < height) 
+            if(x < width && y < height)
            {
                uchar4 d4 = *(uchar4*)(disp + y * disp_step + x);

@@ -259,7 +259,7 @@ namespace cv { namespace gpu { namespace device
                res.y = cvtPixel(d4.y, ndisp);
                res.z = cvtPixel(d4.z, ndisp);
                res.w = cvtPixel(d4.w, ndisp);
-                        
+
                uint4* line = (uint4*)(out_image + y * out_step);
                line[x >> 2] = res;
            }
@@ -270,12 +270,12 @@ namespace cv { namespace gpu { namespace device
            const int x = (blockIdx.x * blockDim.x + threadIdx.x) << 1;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-            if(x < width && y < height) 
+            if(x < width && y < height)
            {
                short2 d2 = *(short2*)(disp + y * disp_step + x);

                uint2 res;
-                res.x = cvtPixel(d2.x, ndisp);            
+                res.x = cvtPixel(d2.x, ndisp);
                res.y = cvtPixel(d2.y, ndisp);

                uint2* line = (uint2*)(out_image + y * out_step);
@@ -290,12 +290,12 @@ namespace cv { namespace gpu { namespace device
            dim3 grid(1, 1, 1);
            grid.x = divUp(src.cols, threads.x << 2);
            grid.y = divUp(src.rows, threads.y);
-             
+
            drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() ); 
+                cudaSafeCall( cudaDeviceSynchronize() );
        }

        void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2Db& dst, int ndisp, const cudaStream_t& stream)
@@ -304,10 +304,10 @@ namespace cv { namespace gpu { namespace device
            dim3 grid(1, 1, 1);
            grid.x = divUp(src.cols, threads.x << 1);
            grid.y = divUp(src.rows, threads.y);
-             
+
            drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);
            cudaSafeCall( cudaGetLastError() );
-            
+
            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
@@ -318,7 +318,7 @@ namespace cv { namespace gpu { namespace device

        template <typename T>
        __global__ void reprojectImageTo3D(const T* disp, size_t disp_step, float* xyzw, size_t xyzw_step, int rows, int cols)
-        {        
+        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

@@ -328,7 +328,7 @@ namespace cv { namespace gpu { namespace device
                float qx = cq[1] * y + cq[3], qy = cq[5] * y + cq[7];
                float qz = cq[9] * y + cq[11], qw = cq[13] * y + cq[15];

-                qx += x * cq[0]; 
+                qx += x * cq[0];
                qy += x * cq[4];
                qz += x * cq[8];
                qw += x * cq[12];
@@ -457,7 +457,7 @@ namespace cv { namespace gpu { namespace device
            bindTexture(&harrisDxTex, Dx);
            bindTexture(&harrisDyTex, Dy);

-            switch (border_type) 
+            switch (border_type)
            {
            case BORDER_REFLECT101_GPU:
                cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
@@ -565,7 +565,7 @@ namespace cv { namespace gpu { namespace device
        {
            dim3 block(32, 8);
            dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
-            
+
            bindTexture(&minEigenValDxTex, Dx);
            bindTexture(&minEigenValDyTex, Dy);

@@ -630,10 +630,10 @@ namespace cv { namespace gpu { namespace device

        __global__ void mulSpectrumsKernel(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)
        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;    
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;    
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-            if (x < c.cols && y < c.rows) 
+            if (x < c.cols && y < c.rows)
            {
                c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
            }
@@ -658,10 +658,10 @@ namespace cv { namespace gpu { namespace device

        __global__ void mulSpectrumsKernel_CONJ(const PtrStep<cufftComplex> a, const PtrStep<cufftComplex> b, DevMem2D_<cufftComplex> c)
        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;    
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;    
+            const int x = blockIdx.x * blockDim.x + threadIdx.x;
+            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-            if (x < c.cols && y < c.rows) 
+            if (x < c.cols && y < c.rows)
            {
                c.ptr(y)[x] = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
            }
@@ -689,7 +689,7 @@ namespace cv { namespace gpu { namespace device
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-            if (x < c.cols && y < c.rows) 
+            if (x < c.cols && y < c.rows)
            {
                cufftComplex v = cuCmulf(a.ptr(y)[x], b.ptr(y)[x]);
                c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
@@ -718,7 +718,7 @@ namespace cv { namespace gpu { namespace device
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-            if (x < c.cols && y < c.rows) 
+            if (x < c.cols && y < c.rows)
            {
                cufftComplex v = cuCmulf(a.ptr(y)[x], cuConjf(b.ptr(y)[x]));
                c.ptr(y)[x] = make_cuFloatComplex(cuCrealf(v) * scale, cuCimagf(v) * scale);
@@ -736,7 +736,7 @@ namespace cv { namespace gpu { namespace device

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
-        }    
+        }

        //////////////////////////////////////////////////////////////////////////
        // buildWarpMaps
@@ -842,7 +842,7 @@ namespace cv { namespace gpu { namespace device


        void buildWarpPlaneMaps(int tl_u, int tl_v, DevMem2Df map_x, DevMem2Df map_y,
-                                const float k_rinv[9], const float r_kinv[9], const float t[3], 
+                                const float k_rinv[9], const float r_kinv[9], const float t[3],
                                float scale, cudaStream_t stream)
        {
            cudaSafeCall(cudaMemcpyToSymbol(build_warp_maps::ck_rinv, k_rinv, 9*sizeof(float)));
@@ -911,27 +911,28 @@ namespace cv { namespace gpu { namespace device

        __constant__ float c_filter2DKernel[FILTER2D_MAX_KERNEL_SIZE * FILTER2D_MAX_KERNEL_SIZE];

-        texture<float, cudaTextureType2D, cudaReadModeElementType> filter2DTex(0, cudaFilterModePoint, cudaAddressModeBorder);
+        texture<float, cudaTextureType2D, cudaReadModeElementType> filter2DTex(0, cudaFilterModePoint, cudaAddressModeClamp);

-        __global__ void filter2D(int ofsX, int ofsY, DevMem2Df dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY)
+        __global__ void filter2D(int ofsX, int ofsY, PtrStepf dst, const int kWidth, const int kHeight, const int anchorX, const int anchorY, const BrdReflect101<float> brd)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
            const int y = blockIdx.y * blockDim.y + threadIdx.y;

-            if (x >= dst.cols || y >= dst.rows)
+            if (x > brd.last_col || y > brd.last_row)
                return;

            float res = 0;
-
-            const int baseX = ofsX + x - anchorX;
-            const int baseY = ofsY + y - anchorY;
-
            int kInd = 0;

            for (int i = 0; i < kHeight; ++i)
            {
                for (int j = 0; j < kWidth; ++j)
-                    res += tex2D(filter2DTex, baseX + j, baseY + i) * c_filter2DKernel[kInd++];
+                {
+                    const int srcX = ofsX + brd.idx_col(x - anchorX + j);
+                    const int srcY = ofsY + brd.idx_row(y - anchorY + i);
+
+                    res += tex2D(filter2DTex, srcX, srcY) * c_filter2DKernel[kInd++];
+                }
            }

            dst.ptr(y)[x] = res;
@@ -946,7 +947,9 @@ namespace cv { namespace gpu { namespace device

            bindTexture(&filter2DTex, src);

-            filter2D<<<grid, block, 0, stream>>>(ofsX, ofsY, dst, kWidth, kHeight, anchorX, anchorY);
+            BrdReflect101<float> brd(dst.rows, dst.cols);
+
+            filter2D<<<grid, block, 0, stream>>>(ofsX, ofsY, dst, kWidth, kHeight, anchorX, anchorY, brd);
            cudaSafeCall(cudaGetLastError());

            if (stream == 0)
--- a/modules/gpu/src/filtering.cpp
+++ b/modules/gpu/src/filtering.cpp
@@ -119,7 +119,7 @@ namespace
    {
        int scale = nDivisor && (kernel.depth() == CV_32F || kernel.depth() == CV_64F) ? 256 : 1;
        if (nDivisor) *nDivisor = scale;
-        
+
        Mat temp(kernel.size(), type);
        kernel.convertTo(temp, type, scale);
        Mat cont_krnl = temp.reshape(1, 1);
@@ -134,7 +134,7 @@ namespace
        }

        gpu_krnl.upload(cont_krnl);
-    } 
+    }
 }

 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -144,7 +144,7 @@ namespace
 {
    struct Filter2DEngine_GPU : public FilterEngine_GPU
    {
-        Filter2DEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int srcType_, int dstType_) : 
+        Filter2DEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int srcType_, int dstType_) :
            filter2D(filter2D_), srcType(srcType_), dstType(dstType_)
        {}

@@ -189,9 +189,9 @@ namespace
 {
    struct SeparableFilterEngine_GPU : public FilterEngine_GPU
    {
-        SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_, 
+        SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_,
                                  int srcType_, int bufType_, int dstType_) :
-            rowFilter(rowFilter_), columnFilter(columnFilter_), 
+            rowFilter(rowFilter_), columnFilter(columnFilter_),
            srcType(srcType_), bufType(bufType_), dstType(dstType_)
        {
            ksize = Size(rowFilter->ksize, columnFilter->ksize);
@@ -199,11 +199,11 @@ namespace

            pbuf = &buf;
        }
-        
-        SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_, 
+
+        SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_, const Ptr<BaseColumnFilter_GPU>& columnFilter_,
                                  int srcType_, int bufType_, int dstType_,
                                  GpuMat& buf_) :
-            rowFilter(rowFilter_), columnFilter(columnFilter_), 
+            rowFilter(rowFilter_), columnFilter(columnFilter_),
            srcType(srcType_), bufType(bufType_), dstType(dstType_)
        {
            ksize = Size(rowFilter->ksize, columnFilter->ksize);
@@ -235,7 +235,7 @@ namespace
            GpuMat srcROI = src(roi);
            GpuMat dstROI = dst(roi);
            GpuMat bufROI = (*pbuf)(roi);
-            
+
            (*rowFilter)(srcROI, bufROI, stream);
            (*columnFilter)(bufROI, dstROI, stream);
        }
@@ -253,13 +253,13 @@ namespace
    };
 }

-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter, 
+Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType)
 {
    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType));
 }

-Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter, 
+Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
    const Ptr<BaseColumnFilter_GPU>& columnFilter, int srcType, int bufType, int dstType, GpuMat& buf)
 {
    return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf));
@@ -284,7 +284,7 @@ namespace

            NppStreamHandler h(stream);

-            nppSafeCall( nppiSumWindowRow_8u32f_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), 
+            nppSafeCall( nppiSumWindowRow_8u32f_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ksize, anchor) );

            if (stream == 0)
@@ -318,7 +318,7 @@ namespace

            NppStreamHandler h(stream);

-            nppSafeCall( nppiSumWindowColumn_8u32f_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step), 
+            nppSafeCall( nppiSumWindowColumn_8u32f_C1R(src.ptr<Npp8u>(), static_cast<int>(src.step),
                dst.ptr<Npp32f>(), static_cast<int>(dst.step), sz, ksize, anchor) );

            if (stream == 0)
@@ -341,7 +341,7 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int sumType, int dstTy

 namespace
 {
-    typedef NppStatus (*nppFilterBox_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI, 
+    typedef NppStatus (*nppFilterBox_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
        NppiSize oMaskSize, NppiPoint oAnchor);

    struct NPPBoxFilter : public BaseFilter_GPU
@@ -363,8 +363,8 @@ namespace
            cudaStream_t stream = StreamAccessor::getStream(s);

            NppStreamHandler h(stream);
-            
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), 
+
+            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step),
                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, oKernelSize, oAnchor) );

            if (stream == 0)
@@ -379,7 +379,7 @@ Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int srcType, int dstType, const Si
 {
    static const nppFilterBox_t nppFilterBox_callers[] = {0, nppiFilterBox_8u_C1R, 0, 0, nppiFilterBox_8u_C4R};

-    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType); 
+    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);

    normalizeAnchor(anchor, ksize);

@@ -413,7 +413,7 @@ namespace

    struct NPPMorphFilter : public BaseFilter_GPU
    {
-        NPPMorphFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, nppMorfFilter_t func_) : 
+        NPPMorphFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, nppMorfFilter_t func_) :
            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
@@ -432,7 +432,7 @@ namespace

            NppStreamHandler h(stream);

-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), 
+            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step),
                dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, kernel.ptr<Npp8u>(), oKernelSize, oAnchor) );

            if (stream == 0)
@@ -446,19 +446,19 @@ namespace

 Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, Point anchor)
 {
-    static const nppMorfFilter_t nppMorfFilter_callers[2][5] = 
+    static const nppMorfFilter_t nppMorfFilter_callers[2][5] =
    {
        {0, nppiErode_8u_C1R, 0, 0, nppiErode_8u_C4R },
        {0, nppiDilate_8u_C1R, 0, 0, nppiDilate_8u_C4R }
    };
- 
-    CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE);   
-    CV_Assert(type == CV_8UC1 || type == CV_8UC4); 
-        
+
+    CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE);
+    CV_Assert(type == CV_8UC1 || type == CV_8UC4);
+
    GpuMat gpu_krnl;
    normalizeKernel(kernel, gpu_krnl);
    normalizeAnchor(anchor, ksize);
-    
+
    return Ptr<BaseFilter_GPU>(new NPPMorphFilter(ksize, anchor, gpu_krnl, nppMorfFilter_callers[op][CV_MAT_CN(type)]));
 }

@@ -466,13 +466,13 @@ namespace
 {
    struct MorphologyFilterEngine_GPU : public FilterEngine_GPU
    {
-        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type_, int iters_) : 
+        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type_, int iters_) :
            filter2D(filter2D_), type(type_), iters(iters_)
        {
            pbuf = &buf;
        }

-        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type_, int iters_, GpuMat& buf_) : 
+        MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU>& filter2D_, int type_, int iters_, GpuMat& buf_) :
            filter2D(filter2D_), type(type_), iters(iters_)
        {
            pbuf = &buf_;
@@ -576,7 +576,7 @@ namespace
        else if (iterations > 1 && countNonZero(_kernel) == _kernel.rows * _kernel.cols)
        {
            anchor = Point(anchor.x * iterations, anchor.y * iterations);
-            kernel = getStructuringElement(MORPH_RECT, Size(ksize.width + iterations * (ksize.width - 1), 
+            kernel = getStructuringElement(MORPH_RECT, Size(ksize.width + iterations * (ksize.width - 1),
                ksize.height + iterations * (ksize.height - 1)), anchor);
            iterations = 1;
        }
@@ -659,7 +659,7 @@ void cv::gpu::morphologyEx(const GpuMat& src, GpuMat& dst, int op, const Mat& ke
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Linear Filter

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    namespace imgproc
    {
@@ -669,12 +669,12 @@ namespace cv { namespace gpu { namespace device

 namespace
 {
-    typedef NppStatus (*nppFilter2D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI, 
+    typedef NppStatus (*nppFilter2D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
        const Npp32s * pKernel, NppiSize oKernelSize, NppiPoint oAnchor, Npp32s nDivisor);

    struct NPPLinearFilter : public BaseFilter_GPU
    {
-        NPPLinearFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter2D_t func_) : 
+        NPPLinearFilter(const Size& ksize_, const Point& anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter2D_t func_) :
            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
@@ -692,8 +692,8 @@ namespace
            cudaStream_t stream = StreamAccessor::getStream(s);

            NppStreamHandler h(stream);
-                                  
-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, 
+
+            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
                kernel.ptr<Npp32s>(), oKernelSize, oAnchor, nDivisor) );

            if (stream == 0)
@@ -707,9 +707,9 @@ namespace

    struct GpuLinearFilter : public BaseFilter_GPU
    {
-        GpuLinearFilter(Size ksize_, Point anchor_, const GpuMat& kernel_) : 
+        GpuLinearFilter(Size ksize_, Point anchor_, const GpuMat& kernel_) :
            BaseFilter_GPU(ksize_, anchor_), kernel(kernel_) {}
-            
+
        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
        {
            using namespace cv::gpu::device::imgproc;
@@ -745,7 +745,7 @@ Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const
    else
    {
        static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R};
-    
+
        GpuMat gpu_krnl;
        int nDivisor;
        normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true);
@@ -753,8 +753,8 @@ Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const
        normalizeAnchor(anchor, ksize);

        return Ptr<BaseFilter_GPU>(new NPPLinearFilter(ksize, anchor, gpu_krnl, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)]));
-    }    
-}    
+    }
+}

 Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Point& anchor)
 {
@@ -780,7 +780,7 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Separable Linear Filter

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    namespace row_filter
    {
@@ -797,14 +797,14 @@ namespace cv { namespace gpu { namespace device

 namespace
 {
-    typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI, 
+    typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI,
        const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);

    typedef void (*gpuFilter1D_t)(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);

    struct NppLinearRowFilter : public BaseRowFilter_GPU
    {
-        NppLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) : 
+        NppLinearRowFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
@@ -817,7 +817,7 @@ namespace

            NppStreamHandler h(stream);

-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, 
+            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
                kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );

            if (stream == 0)
@@ -831,7 +831,7 @@ namespace

    struct GpuLinearRowFilter : public BaseRowFilter_GPU
    {
-        GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) : 
+        GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) :
            BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
@@ -852,7 +852,7 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
    using namespace ::cv::gpu::device::row_filter;

    static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R};
-    
+
    if ((bufType == srcType) && (srcType == CV_8UC1 || srcType == CV_8UC4))
    {
        CV_Assert(borderType == BORDER_CONSTANT);
@@ -867,7 +867,7 @@ Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType,
        return Ptr<BaseRowFilter_GPU>(new NppLinearRowFilter(ksize, anchor, gpu_row_krnl, nDivisor,
            nppFilter1D_callers[CV_MAT_CN(srcType)]));
    }
-    
+
    CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);
    int gpuBorderType;
    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
@@ -914,7 +914,7 @@ namespace
 {
    struct NppLinearColumnFilter : public BaseColumnFilter_GPU
    {
-        NppLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) : 
+        NppLinearColumnFilter(int ksize_, int anchor_, const GpuMat& kernel_, Npp32s nDivisor_, nppFilter1D_t func_) :
            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), nDivisor(nDivisor_), func(func_) {}

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
@@ -927,7 +927,7 @@ namespace

            NppStreamHandler h(stream);

-            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, 
+            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz,
                kernel.ptr<Npp32s>(), ksize, anchor, nDivisor) );

            if (stream == 0)
@@ -941,7 +941,7 @@ namespace

    struct GpuLinearColumnFilter : public BaseColumnFilter_GPU
    {
-        GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) : 
+        GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_, int brd_type_) :
            BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_), brd_type(brd_type_) {}

        virtual void operator()(const GpuMat& src, GpuMat& dst, Stream& s = Stream::Null())
@@ -963,7 +963,7 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
    using namespace ::cv::gpu::device::column_filter;

    static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R};
-    
+
    if ((bufType == dstType) && (bufType == CV_8UC1 || bufType == CV_8UC4))
    {
        CV_Assert(borderType == BORDER_CONSTANT);
@@ -975,14 +975,14 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
        int ksize = gpu_col_krnl.cols;
        normalizeAnchor(anchor, ksize);

-        return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, gpu_col_krnl, nDivisor, 
+        return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, gpu_col_krnl, nDivisor,
            nppFilter1D_callers[CV_MAT_CN(bufType)]));
    }
-    
+
    CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT || borderType == BORDER_REFLECT || borderType == BORDER_WRAP);
    int gpuBorderType;
    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
-   
+
    CV_Assert(dstType == CV_8UC1 || dstType == CV_8UC4 || dstType == CV_16SC3 || dstType == CV_32SC1 || dstType == CV_32FC1);

    CV_Assert(CV_MAT_DEPTH(bufType) == CV_32F && CV_MAT_CN(dstType) == CV_MAT_CN(bufType));
@@ -1021,7 +1021,7 @@ Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int ds
    return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, cont_krnl, func, gpuBorderType));
 }

-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, 
+Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel,
    const Point& anchor, int rowBorderType, int columnBorderType)
 {
    if (columnBorderType < 0)
@@ -1037,7 +1037,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int
    return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType);
 }

-Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, GpuMat& buf, 
+Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, GpuMat& buf,
    const Point& anchor, int rowBorderType, int columnBorderType)
 {
    if (columnBorderType < 0)
@@ -1053,7 +1053,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int
    return createSeparableFilter_GPU(rowFilter, columnFilter, srcType, bufType, dstType, buf);
 }

-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, 
+void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
                          Point anchor, int rowBorderType, int columnBorderType)
 {
    if( ddepth < 0 )
@@ -1065,7 +1065,7 @@ void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat&
    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
 }

-void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf, 
+void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, GpuMat& buf,
                          Point anchor, int rowBorderType, int columnBorderType,
                          Stream& stream)
 {
@@ -1115,7 +1115,7 @@ void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
        else
            ky *= scale;
    }
-    
+
    sepFilter2D(src, dst, ddepth, kx, ky, buf, Point(-1,-1), rowBorderType, columnBorderType, stream);
 }

@@ -1155,7 +1155,7 @@ void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, d
    Mat kernel(3, 3, CV_32S, (void*)K[ksize == 3]);
    if (scale != 1)
        kernel *= scale;
-    
+
    filter2D(src, dst, ddepth, kernel, Point(-1,-1), stream);
 }

@@ -1163,7 +1163,7 @@ void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, d
 // Gaussian Filter

 Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
-{        
+{
    int depth = CV_MAT_DEPTH(type);

    if (sigma2 <= 0)
@@ -1191,7 +1191,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, do
 }

 Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int type, Size ksize, GpuMat& buf, double sigma1, double sigma2, int rowBorderType, int columnBorderType)
-{        
+{
    int depth = CV_MAT_DEPTH(type);

    if (sigma2 <= 0)
@@ -1227,7 +1227,7 @@ void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, double si
    }

    dst.create(src.size(), src.type());
-    
+
    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, rowBorderType, columnBorderType);
    f->apply(src, dst, Rect(0, 0, src.cols, src.rows));
 }
@@ -1241,7 +1241,7 @@ void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& b
    }

    dst.create(src.size(), src.type());
-    
+
    Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, buf, sigma1, sigma2, rowBorderType, columnBorderType);
    f->apply(src, dst, Rect(0, 0, src.cols, src.rows), stream);
 }
@@ -1251,7 +1251,7 @@ void cv::gpu::GaussianBlur(const GpuMat& src, GpuMat& dst, Size ksize, GpuMat& b

 namespace
 {
-    typedef NppStatus (*nppFilterRank_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI, 
+    typedef NppStatus (*nppFilterRank_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oSizeROI,
        NppiSize oMaskSize, NppiPoint oAnchor);

    struct NPPRankFilter : public BaseFilter_GPU
@@ -1273,7 +1273,7 @@ namespace
            cudaStream_t stream = StreamAccessor::getStream(s);

            NppStreamHandler h(stream);
-            
+
            nppSafeCall( func(src.ptr<Npp8u>(), static_cast<int>(src.step), dst.ptr<Npp8u>(), static_cast<int>(dst.step), sz, oKernelSize, oAnchor) );

            if (stream == 0)
@@ -1288,7 +1288,7 @@ Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int srcType, int dstType, const Si
 {
    static const nppFilterRank_t nppFilterRank_callers[] = {0, nppiFilterMax_8u_C1R, 0, 0, nppiFilterMax_8u_C4R};

-    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType); 
+    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);

    normalizeAnchor(anchor, ksize);

@@ -1299,7 +1299,7 @@ Ptr<BaseFilter_GPU> cv::gpu::getMinFilter_GPU(int srcType, int dstType, const Si
 {
    static const nppFilterRank_t nppFilterRank_callers[] = {0, nppiFilterMin_8u_C1R, 0, 0, nppiFilterMin_8u_C4R};

-    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType); 
+    CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);

    normalizeAnchor(anchor, ksize);

--- a/modules/gpu/src/surf.cpp
+++ b/modules/gpu/src/surf.cpp
@@ -63,7 +63,7 @@ void cv::gpu::SURF_GPU::releaseMemory() { throw_nogpu(); }

 #else /* !defined (HAVE_CUDA) */

-namespace cv { namespace gpu { namespace device 
+namespace cv { namespace gpu { namespace device
 {
    namespace surf
    {
@@ -79,13 +79,13 @@ namespace cv { namespace gpu { namespace device
        void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
            int img_rows, int img_cols, int octave, bool use_mask, int nLayers);

-        void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter, 
-            float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian, 
+        void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
+            float* featureX, float* featureY, int* featureLaplacian, float* featureSize, float* featureHessian,
            unsigned int* featureCounter);

        void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);

-        void compute_descriptors_gpu(const DevMem2Df& descriptors, 
+        void compute_descriptors_gpu(const DevMem2Df& descriptors,
            const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
    }
 }}}
@@ -108,7 +108,7 @@ namespace

        return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
    }
-    
+
    class SURF_GPU_Invoker
    {
    public:
@@ -121,11 +121,11 @@ namespace
            CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
            CV_Assert(surf_.nOctaves > 0 && surf_.nOctaveLayers > 0);
            CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS));
-                
+
            const int min_size = calcSize(surf_.nOctaves - 1, 0);
            CV_Assert(img_rows - min_size >= 0);
            CV_Assert(img_cols - min_size >= 0);
-            
+
            const int layer_rows = img_rows >> (surf_.nOctaves - 1);
            const int layer_cols = img_cols >> (surf_.nOctaves - 1);
            const int min_margin = ((calcSize((surf_.nOctaves - 1), 2) >> 1) >> (surf_.nOctaves - 1)) + 1;
@@ -159,7 +159,7 @@ namespace
        {
            ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.det);
            ensureSizeIsEnough(img_rows * (surf_.nOctaveLayers + 2), img_cols, CV_32FC1, surf_.trace);
-            
+
            ensureSizeIsEnough(1, maxCandidates, CV_32SC4, surf_.maxPosBuffer);
            ensureSizeIsEnough(SURF_GPU::SF_FEATURE_STRIDE, maxFeatures, CV_32FC1, keypoints);
            keypoints.setTo(Scalar::all(0));
@@ -182,7 +182,7 @@ namespace

                if (maxCounter > 0)
                {
-                    icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer.ptr<int4>(), maxCounter, 
+                    icvInterpolateKeypoint_gpu(surf_.det, surf_.maxPosBuffer.ptr<int4>(), maxCounter,
                        keypoints.ptr<float>(SURF_GPU::SF_X), keypoints.ptr<float>(SURF_GPU::SF_Y),
                        keypoints.ptr<int>(SURF_GPU::SF_LAPLACIAN), keypoints.ptr<float>(SURF_GPU::SF_SIZE),
                        keypoints.ptr<float>(SURF_GPU::SF_HESSIAN), counters.ptr<unsigned int>());
@@ -238,7 +238,7 @@ namespace
 cv::gpu::SURF_GPU::SURF_GPU()
 {
    hessianThreshold = 100;
-    extended = 1;
+    extended = true;
    nOctaves = 4;
    nOctaveLayers = 2;
    keypointsRatio = 0.01f;
@@ -323,9 +323,9 @@ void cv::gpu::SURF_GPU::downloadKeypoints(const GpuMat& keypointsGPU, vector<Key
    else
    {
        CV_Assert(keypointsGPU.type() == CV_32FC1 && keypointsGPU.rows == SF_FEATURE_STRIDE);
-        
+
        Mat keypointsCPU(keypointsGPU);
-        
+
        keypoints.resize(nFeatures);

        float* kp_x = keypointsCPU.ptr<float>(SF_X);
@@ -373,13 +373,13 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat
    }
 }

-void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors, 
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, GpuMat& keypoints, GpuMat& descriptors,
                                   bool useProvidedKeypoints)
 {
    if (!img.empty())
    {
        SURF_GPU_Invoker surf(*this, img, mask);
-    
+
        if (!useProvidedKeypoints)
            surf.detectKeypoints(keypoints);
        else if (!upright)
@@ -400,20 +400,20 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector
    downloadKeypoints(keypointsGPU, keypoints);
 }

-void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints, 
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints,
    GpuMat& descriptors, bool useProvidedKeypoints)
 {
    GpuMat keypointsGPU;

    if (useProvidedKeypoints)
-        uploadKeypoints(keypoints, keypointsGPU);    
+        uploadKeypoints(keypoints, keypointsGPU);

    (*this)(img, mask, keypointsGPU, descriptors, useProvidedKeypoints);

    downloadKeypoints(keypointsGPU, keypoints);
 }

-void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints, 
+void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector<KeyPoint>& keypoints,
    vector<float>& descriptors, bool useProvidedKeypoints)
 {
    GpuMat descriptorsGPU;
@@ -423,9 +423,9 @@ void cv::gpu::SURF_GPU::operator()(const GpuMat& img, const GpuMat& mask, vector
    downloadDescriptors(descriptorsGPU, descriptors);
 }

-void cv::gpu::SURF_GPU::releaseMemory() 
+void cv::gpu::SURF_GPU::releaseMemory()
 {
-    sum.release(); 
+    sum.release();
    mask1.release();
    maskSum.release();
    intBuffer.release();